started: 01Mar2016 last updated: 02Jun2016
# Start time
Sys.time()
## [1] "2016-06-02 21:34:33 BST"
# Clean-up
rm(list=ls())
graphics.off()
# Set root working folder
library(knitr)
opts_knit$set(root.dir = "/scratch/medgen/scripts/rscripts_05.16")
#setwd("/scratch/medgen/scripts/rscripts_05.16")
#opts_knit$set(root.dir = "C:\\Users\\larion01\\Documents\\GitHub\\R1")
#setwd("C:\\Users\\larion01\\Documents\\GitHub\\R1")
# Required libraries
library(tidyr) # for separate
library(dplyr) # for piping, filter, select etc
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr) # for str_replace_all
library(ggplot2) # for some plots
load(file="data/s04_filter_by_effect_feb2016.RData")
ls()
## [1] "alt.rel" "alt.std" "alt.str"
## [4] "covar.df" "demographics.df" "dp.rel"
## [7] "dp.std" "dp.str" "gq.rel"
## [10] "gq.std" "gq.str" "gt.rel"
## [13] "gt.std" "gt.str" "ref.rel"
## [16] "ref.std" "ref.str" "samples.df"
## [19] "vv.rel" "vv.std" "vv.str"
vv.df <- vv.std
gt.mx <- gt.std
rm(vv.std, gt.std, gq.std, dp.std, ref.std, alt.std)
rm(vv.str, gt.str, gq.str, dp.str, ref.str, alt.str)
rm(vv.rel, gt.rel, gq.rel, dp.rel, ref.rel, alt.rel)
ls()
## [1] "covar.df" "demographics.df" "gt.mx" "samples.df"
## [5] "vv.df"
# I do NOT convert_dfs_to_tables because it loses row manes.
# However, it could be considered later,
# when row names handling in tables is fixed.
dim(covar.df)
## [1] 498 34
str(covar.df)
## 'data.frame': 498 obs. of 34 variables:
## $ Subject_ID : int 200054 200491 200565 200698 200958 201046 201558 201921 202026 202236 ...
## $ setno : int 382125 204356 360683 226881 357431 374980 201558 201921 213991 385058 ...
## $ cc : int 0 0 0 0 0 0 1 1 0 0 ...
## $ chemo : int 1 1 0 0 1 1 1 1 1 1 ...
## $ hormone : int 0 1 1 0 1 0 0 1 1 0 ...
## $ chemo_hormone : Factor w/ 5 levels "","both","chem",..: 3 2 4 5 2 3 3 2 2 3 ...
## $ chemo_self_mra : int 1 1 0 0 1 1 1 1 1 1 ...
## $ hormone_self_mra: int 0 1 1 0 1 0 0 1 1 0 ...
## $ treatment : int 1 1 1 0 1 1 1 1 1 1 ...
## $ ID : int 2 6 7 9 11 12 16 22 24 26 ...
## $ labid : Factor w/ 498 levels "id200054","id200491",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ status : int 0 0 0 0 0 0 1 1 0 0 ...
## $ status2 : int 0 0 0 0 0 0 1 1 0 0 ...
## $ offset : num 6.41 5.82 5.61 4.03 4.77 ...
## $ sub_dx_age : int 46 50 46 44 43 39 45 40 43 36 ...
## $ XRTBreast : int 0 0 0 1 0 1 0 0 1 0 ...
## $ Eigen_1 : num -0.0079 -0.01062 -0.00539 -0.00906 -0.01094 ...
## $ Eigen_2 : num 0.0055 0.00414 0.00302 0.00301 0.0023 ...
## $ Eigen_3 : num -0.02171 0.01254 0.00368 -0.00655 -0.01389 ...
## $ Eigen_4 : num 0.00356 -0.00787 -0.01496 -0.01256 -0.00501 ...
## $ Eigen_5 : num 0.00135 -0.0291 0.00391 0.01357 -0.00526 ...
## $ dose : Factor w/ 3 levels "ge 1Gy","ls 1Gy",..: 3 3 3 2 3 2 3 3 2 3 ...
## $ dsmiss : int 0 0 0 0 1 0 0 0 0 0 ...
## $ good_location : int 1 1 0 1 1 1 0 1 0 0 ...
## $ Deleterious : int 0 0 0 0 0 0 0 0 0 0 ...
## $ registry : Factor w/ 5 levels "de","IA","IR",..: 1 3 3 5 5 4 3 2 2 1 ...
## $ race : int 0 0 0 0 0 0 0 0 0 0 ...
## $ age_stratum1 : Factor w/ 5 levels "20to34","35to39",..: 4 4 4 3 3 2 3 3 3 2 ...
## $ dxyr_stratum : int 2 2 3 1 1 2 1 2 3 2 ...
## $ CMF_Only : int 1 0 0 0 0 1 0 0 1 1 ...
## $ family_history : int 0 0 0 0 0 0 1 0 0 0 ...
## $ sub_dx_age_cat : int 0 0 0 1 1 1 0 1 1 1 ...
## $ CMF : Factor w/ 3 levels "CMF","no","Oth": 1 3 2 2 3 1 3 3 1 1 ...
## $ XRTBrCHAR : int 0 0 0 1 0 1 0 0 1 0 ...
covar.df[1:5,1:5]
## Subject_ID setno cc chemo hormone
## 1 200054 382125 0 1 0
## 2 200491 204356 0 1 1
## 3 200565 360683 0 0 1
## 4 200698 226881 0 0 0
## 5 200958 357431 0 1 1
dim(demographics.df)
## [1] 505 91
str(demographics.df)
## 'data.frame': 505 obs. of 91 variables:
## $ Subject_ID : Factor w/ 503 levels "200054","200491",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ ID.x : num 2 6 7 9 11 12 NA NA 24 26 ...
## $ labid.x : Factor w/ 498 levels "15582015","19212019",..: 6 7 8 9 10 11 NA NA 12 13 ...
## $ Eigen_1.x : num -0.0079 -0.01062 -0.00539 -0.00906 -0.01094 ...
## $ Eigen_2.x : num 0.0055 0.00414 0.00302 0.00301 0.0023 ...
## $ Eigen_3.x : num -0.02171 0.01254 0.00368 -0.00655 -0.01389 ...
## $ Eigen_4.x : num 0.00356 -0.00787 -0.01496 -0.01256 -0.00501 ...
## $ Eigen_5.x : num 0.00135 -0.0291 0.00391 0.01357 -0.00526 ...
## $ Phase : num 1 1 1 1 1 1 NA NA 1 1 ...
## $ setno.x : num 382125 204356 360683 226881 357431 ...
## $ cc.x : int 0 0 0 0 0 0 NA NA 0 0 ...
## $ rstime : num 7.42 9.75 6.09 6.25 9.34 7 NA NA 6.09 8.66 ...
## $ registry.x : Factor w/ 7 levels "","de","IA","IR",..: 2 4 4 7 7 5 NA NA 3 2 ...
## $ race.x : num 0 0 0 0 0 0 NA NA 0 0 ...
## $ offset.x : num 6.41 5.82 5.61 4.03 4.77 ...
## $ DOB : num -5049 -7459 -3916 -5890 -6407 ...
## $ sub_dx_age.x : num 46 50 46 44 43 39 NA NA 43 36 ...
## $ refage : num 53 59 51 50 52 45 NA NA 48 44 ...
## $ BMI_age18 : num 20.2 19.7 23.3 19.5 25.8 ...
## $ BMI_dx : num 22.8 20.9 23.3 32.6 25.8 ...
## $ BMI_ref : num 25.7 22 25.8 32.6 28.1 ...
## $ hormone_self_mra.x : num 0 1 1 0 1 0 NA NA 1 0 ...
## $ chemo_self_mra.x : num 1 1 0 0 1 1 NA NA 1 1 ...
## $ treatment.x : num 1 1 1 0 1 1 NA NA 1 1 ...
## $ family_history.x : Factor w/ 3 levels "1+","none","othe": 2 2 2 2 2 2 NA NA 2 2 ...
## $ rh_age_menarche : num 13 14 13 13 12 9 NA NA 12 11 ...
## $ age_menopause_1yrbf_fd: num -1 48 -1 -1 -1 -1 NA NA -1 -1 ...
## $ age_1fftp_fd : num 20 22 0 29 28 0 NA NA 27 24 ...
## $ Num_ftp_fd : num 1 2 -1 2 2 -1 NA NA 3 2 ...
## $ Histology : Factor w/ 4 levels "lobular","medullar",..: 3 3 3 2 3 3 NA NA 3 3 ...
## $ Histology1 : Factor w/ 4 levels "lobular","medullar",..: 3 3 3 2 3 3 NA NA 3 3 ...
## $ Hist_lob_other : Factor w/ 4 levels "lobular","other",..: 2 2 2 2 2 2 NA NA 2 2 ...
## $ stage_fd : num 2 1 1 1 2 2 NA NA 2 2 ...
## $ er_fd : num 1 1 1 4 1 2 NA NA 2 2 ...
## $ pr_fd : num 1 1 1 2 1 2 NA NA 1 0 ...
## $ histo1_cat : Factor w/ 9 levels "al posi",..: 9 3 3 5 3 3 NA NA 3 3 ...
## $ er1_cat : Factor w/ 5 levels "negative","own unkn",..: 3 3 3 5 3 1 NA NA 1 1 ...
## $ pr1_cat : Factor w/ 7 levels "negative","own 0 Ot",..: 3 3 3 1 3 1 NA NA 3 7 ...
## $ status.x : num 0 0 0 0 0 0 NA NA 0 0 ...
## $ status2.x : Factor w/ 4 levels "","0","1","h": 2 2 2 2 2 2 NA NA 2 2 ...
## $ sub_race : num 0 0 0 0 0 0 NA NA 0 0 ...
## $ er1_num : num 1 1 1 NA 1 0 NA NA 0 0 ...
## $ er1 : int 1 1 1 NA 1 0 NA NA 0 0 ...
## $ horm_tmx : num 0 1 1 0 2 0 NA NA 1 0 ...
## $ XRTBreast.x : num 0 0 0 1 0 1 NA NA 1 0 ...
## $ dose_caseloc : num 0 0 0 0.96 NA 0.88 NA NA 0.76 0 ...
## $ good_location.x : num 1 1 0 1 1 1 NA NA 0 0 ...
## $ avedose : num 0 0 0 1.65 NA 1.45 NA NA 0.91 0 ...
## $ tmx : num 0 1 1 0 0 0 NA NA 1 0 ...
## $ num_preg : num 1 1 0 1 1 0 NA NA 1 1 ...
## $ fam_hist : num 0 0 0 0 0 0 NA NA 0 0 ...
## $ age_menarche : int 1 1 1 1 0 0 NA NA 0 0 ...
## $ lobular : num 0 0 0 0 0 0 NA NA 0 0 ...
## $ age_menopause : int 0 2 0 0 0 0 NA NA 0 0 ...
## $ conf_miss : num 0 0 0 0 0 0 NA NA 0 0 ...
## $ dose_num : num 0 0 0 1 NA 1 NA NA 1 0 ...
## $ cmf : Factor w/ 6 levels "","\xb0\x9b@",..: 4 6 5 5 4 4 NA NA 4 4 ...
## $ cmf_012 : num 1 2 0 0 1 1 NA NA 1 1 ...
## $ setno.y : int 382125 204356 360683 226881 357431 374980 201558 201921 213991 385058 ...
## $ cc.y : int 0 0 0 0 0 0 1 1 0 0 ...
## $ chemo : int 1 1 0 0 1 1 1 1 1 1 ...
## $ hormone : int 0 1 1 0 1 0 0 1 1 0 ...
## $ chemo_hormone : Factor w/ 5 levels "","both","chem",..: 3 2 4 5 2 3 3 2 2 3 ...
## $ chemo_self_mra.y : int 1 1 0 0 1 1 1 1 1 1 ...
## $ hormone_self_mra.y : int 0 1 1 0 1 0 0 1 1 0 ...
## $ treatment.y : int 1 1 1 0 1 1 1 1 1 1 ...
## $ ID.y : int 2 6 7 9 11 12 16 22 24 26 ...
## $ labid.y : Factor w/ 498 levels "id200054","id200491",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ status.y : int 0 0 0 0 0 0 1 1 0 0 ...
## $ status2.y : int 0 0 0 0 0 0 1 1 0 0 ...
## $ offset.y : num 6.41 5.82 5.61 4.03 4.77 ...
## $ sub_dx_age.y : int 46 50 46 44 43 39 45 40 43 36 ...
## $ XRTBreast.y : int 0 0 0 1 0 1 0 0 1 0 ...
## $ Eigen_1.y : num -0.0079 -0.01062 -0.00539 -0.00906 -0.01094 ...
## $ Eigen_2.y : num 0.0055 0.00414 0.00302 0.00301 0.0023 ...
## $ Eigen_3.y : num -0.02171 0.01254 0.00368 -0.00655 -0.01389 ...
## $ Eigen_4.y : num 0.00356 -0.00787 -0.01496 -0.01256 -0.00501 ...
## $ Eigen_5.y : num 0.00135 -0.0291 0.00391 0.01357 -0.00526 ...
## $ dose : Factor w/ 3 levels "ge 1Gy","ls 1Gy",..: 3 3 3 2 3 2 3 3 2 3 ...
## $ dsmiss : int 0 0 0 0 1 0 0 0 0 0 ...
## $ good_location.y : int 1 1 0 1 1 1 0 1 0 0 ...
## $ Deleterious : int 0 0 0 0 0 0 0 0 0 0 ...
## $ registry.y : Factor w/ 5 levels "de","IA","IR",..: 1 3 3 5 5 4 3 2 2 1 ...
## $ race.y : int 0 0 0 0 0 0 0 0 0 0 ...
## $ age_stratum1 : Factor w/ 5 levels "20to34","35to39",..: 4 4 4 3 3 2 3 3 3 2 ...
## $ dxyr_stratum : int 2 2 3 1 1 2 1 2 3 2 ...
## $ CMF_Only : int 1 0 0 0 0 1 0 0 1 1 ...
## $ family_history.y : int 0 0 0 0 0 0 1 0 0 0 ...
## $ sub_dx_age_cat : int 0 0 0 1 1 1 0 1 1 1 ...
## $ CMF : Factor w/ 3 levels "CMF","no","Oth": 1 3 2 2 3 1 3 3 1 1 ...
## $ XRTBrCHAR : int 0 0 0 1 0 1 0 0 1 0 ...
demographics.df[1:5,1:5]
## Subject_ID ID.x labid.x Eigen_1.x Eigen_2.x
## 1 200054 2 id200054 -0.007897666 0.005498053
## 2 200491 6 id200491 -0.010615326 0.004141289
## 3 200565 7 id200565 -0.005393813 0.003017045
## 4 200698 9 id200698 -0.009061714 0.003010147
## 5 200958 11 id200958 -0.010936549 0.002295868
dim(samples.df)
## [1] 512 4
str(samples.df)
## 'data.frame': 512 obs. of 4 variables:
## $ wes_id : Factor w/ 512 levels "P1_A01","P1_A02",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ gwas_id : Factor w/ 510 levels "id200054","id200491",..: 14 405 315 264 67 121 326 251 281 141 ...
## $ merged_id: Factor w/ 512 levels "P1_A01_id203568",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ filter : Factor w/ 5 levels "duplicate","low_concordance",..: 5 5 5 5 5 5 5 5 5 5 ...
samples.df[1:5,]
## wes_id gwas_id merged_id filter
## 1 P1_A01 id203568 P1_A01_id203568 pass
## 2 P1_A02 id357807 P1_A02_id357807 pass
## 3 P1_A03 id325472 P1_A03_id325472 pass
## 4 P1_A04 id304354 P1_A04_id304354 pass
## 5 P1_A05 id222648 P1_A05_id222648 pass
dim(vv.df)
## [1] 19798 49
str(vv.df)
## 'data.frame': 19798 obs. of 49 variables:
## $ MultiAllelic : logi NA NA NA NA NA NA ...
## $ NDA : int 1 1 1 2 1 1 1 1 1 1 ...
## $ TYPE : Factor w/ 2 levels "INDEL","SNP": 2 2 2 1 2 2 2 2 2 2 ...
## $ CHROM : Factor w/ 25 levels "1","10","11",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ POS : int 881871 883883 883946 892487 897287 897792 979748 987173 989899 1132513 ...
## $ REF : Factor w/ 9189 levels "A","AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGAATAAATAAAAT",..: 4792 6632 2878 2879 2878 2878 1 6632 2878 2878 ...
## $ ALT : Factor w/ 4394 levels "A","AAAAAAAAAAAAAAAC",..: 1 1125 3321 1125 3321 3321 3321 2230 3321 3321 ...
## $ QUAL : num 268 333 429 1770 321 ...
## $ DP : int 14713 15345 17601 15782 13649 12434 11763 18698 16424 17543 ...
## $ VQSLOD : num 18.33 15.58 14.87 2.79 17.59 ...
## $ FILTER : Factor w/ 46 levels "INDEL_DP_LESS_THAN_5120.0",..: 35 35 35 35 35 35 35 35 35 35 ...
## $ AC : int 1 2 1 2 1 1 36 1 1 2 ...
## $ AF : num 0.000978 0.001953 0.000977 0.001953 0.000982 ...
## $ AN : int 1022 1024 1024 1024 1018 1022 1020 1024 1024 1024 ...
## $ NEGATIVE_TRAIN_SITE : Factor w/ 1 level "true": NA NA NA NA NA NA NA NA NA NA ...
## $ POSITIVE_TRAIN_SITE : Factor w/ 1 level "true": NA NA NA NA NA NA 1 NA NA NA ...
## $ ALT_frequency_in_1k_90 : Factor w/ 1 level "true": NA NA NA NA NA NA NA NA NA NA ...
## $ ALT_frequency_in_1k_95 : Factor w/ 1 level "true": NA NA NA NA NA NA NA NA NA NA ...
## $ ALT_frequency_in_1k_99 : Factor w/ 1 level "true": NA NA NA NA NA NA NA NA NA NA ...
## $ ALT_frequency_in_1k_100: Factor w/ 1 level "true": NA NA NA NA NA NA NA NA NA NA ...
## $ SYMBOL : Factor w/ 21214 levels "","A1BG","A1CF",..: 11872 11872 11872 11872 9227 9227 617 617 617 19515 ...
## $ Allele : Factor w/ 3662 levels "-","A","AA","AAA",..: 2 839 2726 1 2726 2726 2726 1798 2726 2726 ...
## $ ALLELE_NUM : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Consequence : Factor w/ 95 levels "3_prime_UTR_variant",..: 78 27 27 9 27 78 27 27 27 27 ...
## $ IMPACT : Factor w/ 4 levels "HIGH","LOW","MODERATE",..: 1 3 3 1 3 1 3 3 3 3 ...
## $ CLIN_SIG : Factor w/ 95 levels "","association",..: 1 1 1 1 1 1 3 1 1 1 ...
## $ SIFT : Factor w/ 205 levels "","deleterious(0)",..: 1 2 3 1 6 1 3 2 4 4 ...
## $ PolyPhen : Factor w/ 1003 levels "","benign(0)",..: 1 938 948 1 999 1 934 1001 993 1001 ...
## $ Existing_variation : Factor w/ 563471 levels "","1KG_12_3747324",..: 1 98520 380067 527722 521349 484071 25218 471172 124473 523764 ...
## $ GMAF : Factor w/ 16424 levels "","-:0.0000",..: 1 5980 12791 1 1 1 12918 1 1 1 ...
## $ AFR_MAF : Factor w/ 10312 levels "","-:0","-:0&-:0",..: 1 3689 7935 1 1 1 7936 1 1 1 ...
## $ ASN_MAF : logi NA NA NA NA NA NA ...
## $ EUR_MAF : Factor w/ 8788 levels "","-:0","-:0&-:0",..: 1 3132 6730 1 1 1 6828 1 1 1 ...
## $ EAS_MAF : Factor w/ 8404 levels "","-:0","-:0&-:0",..: 1 3011 6463 1 1 1 6477 1 1 1 ...
## $ SAS_MAF : Factor w/ 8543 levels "","-:0","-:0&-:0",..: 1 3033 6541 1 1 1 6591 1 1 1 ...
## $ AA_MAF : Factor w/ 32797 levels "","-:0","-:0&-:0",..: 1 9743 1 2 1 1 25621 1 25480 1 ...
## $ EA_MAF : Factor w/ 33625 levels "","-:0","-:0&-:0",..: 1 10068 1 9 1 1 26573 1 26019 1 ...
## $ cDNA_position : Factor w/ 23204 levels "","0-1","1","?-1",..: 5983 5178 4817 12987 18606 21591 7920 16660 17244 5843 ...
## $ CDS_position : Factor w/ 20550 levels "","1","?-1","10",..: 5132 4361 4062 10114 14641 17700 6813 14511 15052 4440 ...
## $ Codons : Factor w/ 6876 levels "","-/A","-/AA",..: 2215 4055 2887 4121 2561 2875 4074 5612 823 743 ...
## $ Protein_position : Factor w/ 9321 levels "","1","?-1","10",..: 7541 7149 6999 441 2379 3622 8316 2321 2536 7193 ...
## $ Amino_acids : Factor w/ 4479 levels "","-/*","*","A",..: 2862 791 3133 627 2808 3076 906 4370 3875 3827 ...
## $ DISTANCE : int NA NA NA NA NA NA NA NA NA NA ...
## $ STRAND : int -1 -1 -1 -1 1 1 1 1 1 1 ...
## $ SYMBOL_SOURCE : Factor w/ 7 levels "","Clone_based_ensembl_gene",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ SIFT_Call : Factor w/ 4 levels "deleterious",..: NA 1 1 NA 1 NA 1 1 1 1 ...
## $ SIFT_Score : num NA 0 0.01 NA 0.04 NA 0.01 0 0.02 0.02 ...
## $ PolyPhen_Call : Factor w/ 4 levels "benign","possibly_damaging",..: NA 3 3 NA 3 NA 3 3 3 3 ...
## $ PolyPhen_Score : num NA 0.936 0.946 NA 0.997 NA 0.932 0.999 0.991 0.999 ...
vv.df[1:5,1:5]
## MultiAllelic NDA TYPE CHROM POS
## var000000170 NA 1 SNP 1 881871
## var000000182 NA 1 SNP 1 883883
## var000000184 NA 1 SNP 1 883946
## var000000217 NA 2 INDEL 1 892487
## var000000239 NA 1 SNP 1 897287
dim(gt.mx)
## [1] 19798 512
class(gt.mx)
## [1] "matrix"
gt.mx[10:25,1:5]
## P1_A01.GT P1_A02.GT P1_A03.GT P1_A04.GT P1_A05.GT
## var000000764 0 0 0 0 0
## var000001089 0 NA NA 0 0
## var000001126 0 0 0 0 0
## var000001160 0 0 0 0 0
## var000001342 0 0 0 0 0
## var000001434 0 NA NA 0 0
## var000001441 0 0 0 1 0
## var000001462 0 0 0 0 0
## var000001507 0 0 0 0 0
## var000001923 0 0 0 0 NA
## var000001963 0 0 0 0 0
## var000002278 0 NA 0 0 0
## var000002279 2 NA 1 1 1
## var000002294 2 0 1 1 1
## var000002426 0 0 0 0 0
## var000002458 0 0 0 0 0
# Check consistence of rownames
sum(rownames(gt.mx) != rownames(vv.df), na.rm=TRUE)
## [1] 0
# select relevant variants annotations
variants.df <-
vv.df %>%
select(SYMBOL, TYPE, CHROM, POS, REF, ALT, AC, AF, AN,
Consequence, SIFT, PolyPhen, CLIN_SIG,
GMAF, EUR_MAF, ALT_frequency_in_1k_95)
rm(vv.df)
# Explore cases with confusing GMAF/EUR_MAF containing & symbol
nrow(variants.df %>% filter(grepl("&",GMAF)))
## [1] 0
nrow(variants.df %>% filter(grepl("&",EUR_MAF)))
## [1] 97
variants.df %>%
filter(grepl("&",EUR_MAF)) %>%
select(c(1:6), GMAF, EUR_MAF) %>%
top_n(10,EUR_MAF)
## SYMBOL TYPE CHROM POS REF ALT GMAF EUR_MAF
## 1 RPE65 SNP 1 68910315 C T T:0.0034 T:0&T:0
## 2 ABCA4 SNP 1 94485278 C T T:0.0004 T:0&T:0
## 3 CAV3 SNP 3 8775661 C T T:0.3710 T:0.2684&T:0.2684
## 4 CASR SNP 3 122003757 G T T:0.0942 T:0.1451&T:0.1451
## 5 KDR SNP 4 55979558 C T T:0.1526 T:0.0944&T:0.0944
## 6 POMT1 SNP 9 134385435 C T T:0.0787 T:0.1461&T:0.1461
## 7 KRT75 SNP 12 52827608 C T T:0.1432 T:0.1213&T:0.1213
## 8 RPGRIP1 SNP 14 21790040 G T T:0.1673 T:0.2366&T:0.2366
## 9 TNFRSF13B SNP 17 16843666 C T T:0.0002 T:0&T:0
## 10 C1GALT1C1 SNP X 119760629 A T T:0.1475 T:0.1809&T:0.1809
# There are even 3 variants with triplicated values separated by &
variants.df[c(1235, 11405, 15894),c("GMAF", "EUR_MAF")]
## GMAF EUR_MAF
## var000050038 T:0.0004 T:0.001&T:0.001&T:0.001
## var000449558 A:0.0002 A:0&A:0&A:0
## var000628093 A:0.0008 A:0.002&A:0.002&A:0.002
# Split cases with confusing EUR_MAF containing "&"" symbol
variants.df <-
variants.df %>%
separate(EUR_MAF, c("EUR_MAF1", "EUR_MAF2"), "&")
## Warning: Too many values at 3 locations: 1235, 11405, 15894
## Warning: Too few values at 19701 locations: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
## 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...
# All duplicated values are the same
sum(variants.df$EUR_MAF1 != variants.df$EUR_MAF2, na.rm=TRUE)
## [1] 0
# Remove one of the duplicates
variants.df <-
variants.df %>%
select(-EUR_MAF2)
# split GMAF and EUR_MAF1
variants.df <-
variants.df %>%
separate(GMAF, c("GMAF_Allele", "GMAF_Fraction"), ":") %>%
separate(EUR_MAF1, c("EUR_MAF_Allele", "EUR_MAF_Fraction"), ":")
## Warning: Too few values at 11392 locations: 1, 4, 5, 6, 8, 9, 10, 12, 19,
## 20, 25, 26, 29, 30, 31, 32, 33, 35, 36, 37, ...
## Warning: Too few values at 11390 locations: 1, 4, 5, 6, 8, 9, 10, 12, 19,
## 20, 25, 26, 29, 30, 31, 32, 33, 35, 36, 37, ...
# Convert columns to numeric etc
as.numeric(variants.df$GMAF_Fraction) -> variants.df$GMAF_Fraction
as.numeric(variants.df$EUR_MAF_Fraction) -> variants.df$EUR_MAF_Fraction
# Change "" to NAs
NA -> variants.df$GMAF_Allele[variants.df$GMAF_Allele == ""]
NA -> variants.df$EUR_MAF_Allele[variants.df$EUR_MAF_Allele == ""]
NA -> variants.df$SIFT[variants.df$SIFT == ""]
NA -> variants.df$PolyPhen[variants.df$PolyPhen == ""]
NA -> variants.df$CLIN_SIG[variants.df$CLIN_SIG == ""]
# Convert chars to factors
as.factor(variants.df$GMAF_Allele) -> variants.df$GMAF_Allele
as.factor(variants.df$EUR_MAF_Allele) -> variants.df$EUR_MAF_Allele
# Change "true" to TRUE
sum(variants.df$ALT_frequency_in_1k_95 == "true", na.rm = TRUE)
## [1] 39
variants.df <-
variants.df %>%
mutate(frequent_in_1k=as.logical(
str_replace_all(ALT_frequency_in_1k_95, "true", TRUE))) %>%
select(-ALT_frequency_in_1k_95)
sum(variants.df$frequent_in_1k, na.rm=TRUE)
## [1] 39
# Split SIFT
variants.df <-
variants.df %>%
mutate(SIFT_call=sub("\\(.*\\)","",SIFT)) %>%
mutate(SIFT_score=as.numeric(
sub(".*\\(","", sub("\\)","",SIFT)))) %>%
select(-SIFT)
# Split PolyPhen
variants.df <-
variants.df %>%
mutate(PolyPhen_call=sub("\\(.*\\)","",PolyPhen)) %>%
mutate(PolyPhen_score=as.numeric(
sub(".*\\(","", sub("\\)","",PolyPhen)))) %>%
select(-PolyPhen)
# Check variants table
dim(variants.df)
## [1] 19798 20
str(variants.df)
## 'data.frame': 19798 obs. of 20 variables:
## $ SYMBOL : Factor w/ 21214 levels "","A1BG","A1CF",..: 11872 11872 11872 11872 9227 9227 617 617 617 19515 ...
## $ TYPE : Factor w/ 2 levels "INDEL","SNP": 2 2 2 1 2 2 2 2 2 2 ...
## $ CHROM : Factor w/ 25 levels "1","10","11",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ POS : int 881871 883883 883946 892487 897287 897792 979748 987173 989899 1132513 ...
## $ REF : Factor w/ 9189 levels "A","AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGAATAAATAAAAT",..: 4792 6632 2878 2879 2878 2878 1 6632 2878 2878 ...
## $ ALT : Factor w/ 4394 levels "A","AAAAAAAAAAAAAAAC",..: 1 1125 3321 1125 3321 3321 3321 2230 3321 3321 ...
## $ AC : int 1 2 1 2 1 1 36 1 1 2 ...
## $ AF : num 0.000978 0.001953 0.000977 0.001953 0.000982 ...
## $ AN : int 1022 1024 1024 1024 1018 1022 1020 1024 1024 1024 ...
## $ Consequence : Factor w/ 95 levels "3_prime_UTR_variant",..: 78 27 27 9 27 78 27 27 27 27 ...
## $ CLIN_SIG : Factor w/ 95 levels "","association",..: NA NA NA NA NA NA 3 NA NA NA ...
## $ GMAF_Allele : Factor w/ 46 levels "-","A","AA","AAAT",..: NA 15 36 NA NA NA 36 NA NA NA ...
## $ GMAF_Fraction : num NA 0.0002 0.0002 NA NA NA 0.0184 NA NA NA ...
## $ EUR_MAF_Allele : Factor w/ 47 levels "-","A","AA","AAAT",..: NA 15 36 NA NA NA 36 NA NA NA ...
## $ EUR_MAF_Fraction: num NA 0 0 NA NA NA 0.0467 NA NA NA ...
## $ frequent_in_1k : logi NA NA NA NA NA NA ...
## $ SIFT_call : chr NA "deleterious" "deleterious" NA ...
## $ SIFT_score : num NA 0 0.01 NA 0.04 NA 0.01 0 0.02 0.02 ...
## $ PolyPhen_call : chr NA "probably_damaging" "probably_damaging" NA ...
## $ PolyPhen_score : num NA 0.936 0.946 NA 0.997 NA 0.932 0.999 0.991 0.999 ...
variants.df[1:5,1:5]
## SYMBOL TYPE CHROM POS REF
## 1 NOC2L SNP 1 881871 G
## 2 NOC2L SNP 1 883883 T
## 3 NOC2L SNP 1 883946 C
## 4 NOC2L INDEL 1 892487 CA
## 5 KLHL17 SNP 1 897287 C
dim(covar.df)
## [1] 498 34
colnames(covar.df)
## [1] "Subject_ID" "setno" "cc"
## [4] "chemo" "hormone" "chemo_hormone"
## [7] "chemo_self_mra" "hormone_self_mra" "treatment"
## [10] "ID" "labid" "status"
## [13] "status2" "offset" "sub_dx_age"
## [16] "XRTBreast" "Eigen_1" "Eigen_2"
## [19] "Eigen_3" "Eigen_4" "Eigen_5"
## [22] "dose" "dsmiss" "good_location"
## [25] "Deleterious" "registry" "race"
## [28] "age_stratum1" "dxyr_stratum" "CMF_Only"
## [31] "family_history" "sub_dx_age_cat" "CMF"
## [34] "XRTBrCHAR"
attach(covar.df)
# chemo is the same as chemo_self_mra, no missed values
sum(chemo != chemo_self_mra)
## [1] 0
# One missed case in hormone/ hormone_self_mra
sum(is.na(hormone))
## [1] 1
sum(is.na(hormone_self_mra))
## [1] 1
# hormone has 10 differences from hormone_self_mra
sum(hormone != hormone_self_mra, na.rm=TRUE)
## [1] 10
# status is the same as status2, no missed values
sum(status != status2)
## [1] 0
# Race is always 0, no missed data
sum(race)
## [1] 0
# Dismiss split 495 vs 3, no missed data
sum(dsmiss)
## [1] 3
# good_location split 86 vs 412, no missed data
sum(good_location)
## [1] 412
# Deleterious split 480 vs 18, no missed data
sum(Deleterious)
## [1] 18
# Deleterious split 158 vs 340, no missed data
sum(family_history)
## [1] 158
# XRTBrCHAR is the same as XRTBreast, no missed values
sum(XRTBrCHAR != XRTBreast)
## [1] 0
# sub_dx_age_cat split 169 vs 329, no missed data
sum(sub_dx_age_cat)
## [1] 329
# dose
summary(dose)
## ge 1Gy ls 1Gy no dose
## 107 153 238
detach(covar.df)
# Keep only selected annotations
selected_annotations <- c(
"labid","cc","sub_dx_age","offset",
"chemo","hormone","XRTBreast","dose",
"Eigen_1","Eigen_2","Eigen_3","Eigen_4","Eigen_5")
# Keep only selected annotations
covar.df <- covar.df[,selected_annotations]
rm(selected_annotations)
# change XRTBreast to xrt
"xrt" -> colnames(covar.df)[7]
# Check result
dim(covar.df)
## [1] 498 13
str(covar.df)
## 'data.frame': 498 obs. of 13 variables:
## $ labid : Factor w/ 498 levels "id200054","id200491",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ cc : int 0 0 0 0 0 0 1 1 0 0 ...
## $ sub_dx_age: int 46 50 46 44 43 39 45 40 43 36 ...
## $ offset : num 6.41 5.82 5.61 4.03 4.77 ...
## $ chemo : int 1 1 0 0 1 1 1 1 1 1 ...
## $ hormone : int 0 1 1 0 1 0 0 1 1 0 ...
## $ xrt : int 0 0 0 1 0 1 0 0 1 0 ...
## $ dose : Factor w/ 3 levels "ge 1Gy","ls 1Gy",..: 3 3 3 2 3 2 3 3 2 3 ...
## $ Eigen_1 : num -0.0079 -0.01062 -0.00539 -0.00906 -0.01094 ...
## $ Eigen_2 : num 0.0055 0.00414 0.00302 0.00301 0.0023 ...
## $ Eigen_3 : num -0.02171 0.01254 0.00368 -0.00655 -0.01389 ...
## $ Eigen_4 : num 0.00356 -0.00787 -0.01496 -0.01256 -0.00501 ...
## $ Eigen_5 : num 0.00135 -0.0291 0.00391 0.01357 -0.00526 ...
covar.df[1:5,1:5]
## labid cc sub_dx_age offset chemo
## 1 id200054 0 46 6.406880 1
## 2 id200491 0 50 5.820083 1
## 3 id200565 0 46 5.609472 0
## 4 id200698 0 44 4.034241 0
## 5 id200958 0 43 4.770685 1
# Outlook
dim(demographics.df)
## [1] 505 91
colnames(demographics.df)
## [1] "Subject_ID" "ID.x"
## [3] "labid.x" "Eigen_1.x"
## [5] "Eigen_2.x" "Eigen_3.x"
## [7] "Eigen_4.x" "Eigen_5.x"
## [9] "Phase" "setno.x"
## [11] "cc.x" "rstime"
## [13] "registry.x" "race.x"
## [15] "offset.x" "DOB"
## [17] "sub_dx_age.x" "refage"
## [19] "BMI_age18" "BMI_dx"
## [21] "BMI_ref" "hormone_self_mra.x"
## [23] "chemo_self_mra.x" "treatment.x"
## [25] "family_history.x" "rh_age_menarche"
## [27] "age_menopause_1yrbf_fd" "age_1fftp_fd"
## [29] "Num_ftp_fd" "Histology"
## [31] "Histology1" "Hist_lob_other"
## [33] "stage_fd" "er_fd"
## [35] "pr_fd" "histo1_cat"
## [37] "er1_cat" "pr1_cat"
## [39] "status.x" "status2.x"
## [41] "sub_race" "er1_num"
## [43] "er1" "horm_tmx"
## [45] "XRTBreast.x" "dose_caseloc"
## [47] "good_location.x" "avedose"
## [49] "tmx" "num_preg"
## [51] "fam_hist" "age_menarche"
## [53] "lobular" "age_menopause"
## [55] "conf_miss" "dose_num"
## [57] "cmf" "cmf_012"
## [59] "setno.y" "cc.y"
## [61] "chemo" "hormone"
## [63] "chemo_hormone" "chemo_self_mra.y"
## [65] "hormone_self_mra.y" "treatment.y"
## [67] "ID.y" "labid.y"
## [69] "status.y" "status2.y"
## [71] "offset.y" "sub_dx_age.y"
## [73] "XRTBreast.y" "Eigen_1.y"
## [75] "Eigen_2.y" "Eigen_3.y"
## [77] "Eigen_4.y" "Eigen_5.y"
## [79] "dose" "dsmiss"
## [81] "good_location.y" "Deleterious"
## [83] "registry.y" "race.y"
## [85] "age_stratum1" "dxyr_stratum"
## [87] "CMF_Only" "family_history.y"
## [89] "sub_dx_age_cat" "CMF"
## [91] "XRTBrCHAR"
demographics.df[1:10,1:5]
## Subject_ID ID.x labid.x Eigen_1.x Eigen_2.x
## 1 200054 2 id200054 -0.007897666 0.005498053
## 2 200491 6 id200491 -0.010615326 0.004141289
## 3 200565 7 id200565 -0.005393813 0.003017045
## 4 200698 9 id200698 -0.009061714 0.003010147
## 5 200958 11 id200958 -0.010936549 0.002295868
## 6 201046 12 id201046 -0.010397451 0.004503799
## 7 201558 NA <NA> NA NA
## 8 201921 NA <NA> NA NA
## 9 202026 24 id202026 -0.011427344 0.003990090
## 10 202236 26 id202236 -0.007524541 -0.001414092
# Exclude odd cases
odd_cases=c("201558","201921","202698","21IAno","387460",
"389192","58IR1+","60IA1+","92IRno","98IAno")
demographics.df %>%
filter(Subject_ID %in% odd_cases) %>%
select(1:5)
## Subject_ID ID.x labid.x Eigen_1.x Eigen_2.x
## 1 201558 NA <NA> NA NA
## 2 201921 NA <NA> NA NA
## 3 202698 NA <NA> NA NA
## 4 21IAno 1.164234e-25 19212019 -2.400941e-267 -1.400366e-103
## 5 387460 NA <NA> NA NA
## 6 389192 NA <NA> NA NA
## 7 58IR1+ -9.238075e+11 15582015 6.560630e-155 -6.845025e-292
## 8 60IA1+ -7.255563e-207 74603874 7.125925e-196 -1.737869e+90
## 9 92IRno 3.394376e+132 91923891 -2.236906e+99 8.169427e-68
## 10 98IAno 7.355810e+306 26982026 7.797358e+36 -1.779783e-65
pheno.df <- demographics.df %>% filter(! Subject_ID %in% odd_cases)
dim(pheno.df)
## [1] 495 91
pheno.df[1:10,1:5]
## Subject_ID ID.x labid.x Eigen_1.x Eigen_2.x
## 1 200054 2 id200054 -0.007897666 0.005498053
## 2 200491 6 id200491 -0.010615326 0.004141289
## 3 200565 7 id200565 -0.005393813 0.003017045
## 4 200698 9 id200698 -0.009061714 0.003010147
## 5 200958 11 id200958 -0.010936549 0.002295868
## 6 201046 12 id201046 -0.010397451 0.004503799
## 7 202026 24 id202026 -0.011427344 0.003990090
## 8 202236 26 id202236 -0.007524541 -0.001414092
## 9 202718 31 id202718 -0.012475710 0.005576766
## 10 203186 36 id203186 -0.008093648 0.012604182
duplicated_cases <-
pheno.df %>%
filter(duplicated(Subject_ID)) %>%
select(Subject_ID)
duplicated_cases <- as.vector(duplicated_cases[,1])
duplicated_cases
## [1] "259643" "272715"
# Phenotype annotations for the duplicate record are identical
pheno.df %>% filter(Subject_ID %in% duplicated_cases)
## Subject_ID ID.x labid.x Eigen_1.x Eigen_2.x Eigen_3.x
## 1 259643 574 id259643 -0.009376837 0.002435931 -0.008098524
## 2 259643 574 id259643 -0.009376837 0.002435931 -0.008098524
## 3 272715 664 id272715 -0.013179119 0.008036325 0.002829476
## 4 272715 664 id272715 -0.013179119 0.008036325 0.002829476
## Eigen_4.x Eigen_5.x Phase setno.x cc.x rstime registry.x race.x
## 1 -0.012619313 0.006557315 1 259643 1 3.33 IA 0
## 2 -0.012619313 0.006557315 1 259643 1 3.33 IA 0
## 3 0.004044336 -0.013210921 1 259643 0 3.33 IA 0
## 4 0.004044336 -0.013210921 1 259643 0 3.33 IA 0
## offset.x DOB sub_dx_age.x refage BMI_age18 BMI_dx BMI_ref
## 1 5.384495 -6547 47 50 18.88158 22.31460 22.31460
## 2 5.384495 -6547 47 50 18.88158 22.31460 22.31460
## 3 3.417727 -7883 48 50 23.43605 29.29506 29.29506
## 4 3.417727 -7883 48 50 23.43605 29.29506 29.29506
## hormone_self_mra.x chemo_self_mra.x treatment.x family_history.x
## 1 1 0 1 none
## 2 1 0 1 none
## 3 0 0 0 1+
## 4 0 0 0 1+
## rh_age_menarche age_menopause_1yrbf_fd age_1fftp_fd Num_ftp_fd Histology
## 1 13 -1 19 2 other
## 2 13 -1 19 2 other
## 3 12 -1 25 2 other
## 4 12 -1 25 2 other
## Histology1 Hist_lob_other stage_fd er_fd pr_fd histo1_cat er1_cat
## 1 other other 1 1 1 ductal positive
## 2 other other 1 1 1 ductal positive
## 3 other other 1 2 2 ductal negative
## 4 other other 1 2 2 ductal negative
## pr1_cat status.x status2.x sub_race er1_num er1 horm_tmx XRTBreast.x
## 1 positive 1 1 0 1 1 1 0
## 2 positive 1 1 0 1 1 1 0
## 3 negative 0 0 0 0 0 0 1
## 4 negative 0 0 0 0 0 0 1
## dose_caseloc good_location.x avedose tmx num_preg fam_hist age_menarche
## 1 0.0 1 0.00 1 1 0 1
## 2 0.0 1 0.00 1 1 0 1
## 3 1.4 1 2.22 0 1 1 0
## 4 1.4 1 2.22 0 1 1 0
## lobular age_menopause conf_miss dose_num cmf cmf_012 setno.y cc.y chemo
## 1 0 0 0 0 no 0 259643 1 0
## 2 0 0 0 0 no 0 259643 1 0
## 3 0 0 0 2 no 0 259643 0 0
## 4 0 0 0 2 no 0 259643 0 0
## hormone chemo_hormone chemo_self_mra.y hormone_self_mra.y treatment.y
## 1 1 horm 0 1 1
## 2 1 horm 0 1 1
## 3 0 none 0 0 0
## 4 0 none 0 0 0
## ID.y labid.y status.y status2.y offset.y sub_dx_age.y XRTBreast.y
## 1 574 id259643 1 1 5.384495 47 0
## 2 574 id259643 1 1 5.384495 47 0
## 3 664 id272715 0 0 3.417727 48 1
## 4 664 id272715 0 0 3.417727 48 1
## Eigen_1.y Eigen_2.y Eigen_3.y Eigen_4.y Eigen_5.y dose
## 1 -0.009376837 0.002435931 -0.008098524 -0.012619313 0.006557315 no dose
## 2 -0.009376837 0.002435931 -0.008098524 -0.012619313 0.006557315 no dose
## 3 -0.013179119 0.008036325 0.002829476 0.004044336 -0.013210921 ge 1Gy
## 4 -0.013179119 0.008036325 0.002829476 0.004044336 -0.013210921 ge 1Gy
## dsmiss good_location.y Deleterious registry.y race.y age_stratum1
## 1 0 1 0 IA 0 45to49
## 2 0 1 0 IA 0 45to49
## 3 0 1 0 IA 0 45to49
## 4 0 1 0 IA 0 45to49
## dxyr_stratum CMF_Only family_history.y sub_dx_age_cat CMF XRTBrCHAR
## 1 1 0 0 0 no 0
## 2 1 0 0 0 no 0
## 3 1 0 1 0 no 1
## 4 1 0 1 0 no 1
pheno.df <- pheno.df %>% filter(!duplicated(Subject_ID))
dim(pheno.df)
## [1] 493 91
pheno.df[1:10,1:5]
## Subject_ID ID.x labid.x Eigen_1.x Eigen_2.x
## 1 200054 2 id200054 -0.007897666 0.005498053
## 2 200491 6 id200491 -0.010615326 0.004141289
## 3 200565 7 id200565 -0.005393813 0.003017045
## 4 200698 9 id200698 -0.009061714 0.003010147
## 5 200958 11 id200958 -0.010936549 0.002295868
## 6 201046 12 id201046 -0.010397451 0.004503799
## 7 202026 24 id202026 -0.011427344 0.003990090
## 8 202236 26 id202236 -0.007524541 -0.001414092
## 9 202718 31 id202718 -0.012475710 0.005576766
## 10 203186 36 id203186 -0.008093648 0.012604182
attach(pheno.df)
# Case-control status
ID.x[1:5]
## [1] 2 6 7 9 11
sum(ID.x != ID.y)
## [1] 0
cc.x[1:20]
## [1] 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1 0 0 1 0 0
sum(cc.x != cc.y)
## [1] 0
ggplot(pheno.df) +
aes(as.factor(cc.x), fill=as.factor(cc.x)) +
geom_bar(colour="black", alpha=0.3)
table(cc.x)
## cc.x
## 0 1
## 247 246
# Lables x vs y
labid.x[1:5]
## [1] id200054 id200491 id200565 id200698 id200958
## 498 Levels: 15582015 19212019 26982026 74603874 91923891 ... id399943
sum(as.vector(labid.x) != as.vector(labid.y))
## [1] 0
# Familial history
fam_hist[1:5]
## [1] 0 0 0 0 0
sum(fam_hist != family_history.y)
## [1] 0
sum(as.numeric(sub("none",0,sub("1\\+",1,family_history.x))) != family_history.y)
## [1] 0
ggplot(pheno.df) +
aes(as.factor(family_history.x), fill=as.factor(cc.x)) +
geom_bar(colour="black", alpha=0.3)
table(family_history.x)
## family_history.x
## 1+ none othe
## 156 337 0
table(family_history.y)
## family_history.y
## 0 1
## 337 156
# No disbalance in familial history between cases and controls
ggplot(pheno.df) +
aes(as.factor(cc.x), fill=as.factor(family_history.x)) +
geom_bar(colour="black", alpha=0.3)
table(cc.y, family_history.y)
## family_history.y
## cc.y 0 1
## 0 178 69
## 1 159 87
fisher.test(table(cc.y, family_history.y))
##
## Fisher's Exact Test for Count Data
##
## data: table(cc.y, family_history.y)
## p-value = 0.08182
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.9469101 2.1065143
## sample estimates:
## odds ratio
## 1.410529
# 1st Ds age (?)
sub_dx_age.x[1:5]
## [1] 46 50 46 44 43
sum(sub_dx_age.x != sub_dx_age.y)
## [1] 0
ggplot(pheno.df) +
aes(sub_dx_age.x, fill=as.factor(cc.x)) +
geom_histogram(colour="black", alpha=0.3, binwidth = 1)
# Consistent with 55 yr selection criteria
# No differences between cases and controls by age
t.test(sub_dx_age.x~cc.x)
##
## Welch Two Sample t-test
##
## data: sub_dx_age.x by cc.x
## t = -0.29954, df = 490.96, p-value = 0.7647
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.0209136 0.7508106
## sample estimates:
## mean in group 0 mean in group 1
## 42.22267 42.35772
ggplot(pheno.df) +
aes(sub_dx_age.x, fill=as.factor(cc.x)) +
geom_density(colour="black", alpha=0.3)
# Compare to Refage (age at 2nd ds???)
ggplot(pheno.df, colour="black") +
geom_density(aes(refage), fill="red", alpha=0.3) +
geom_density(aes(sub_dx_age.x), fill="blue", alpha=0.3)
# No differences between cases and controls by refage
t.test(refage~cc.x)
##
## Welch Two Sample t-test
##
## data: refage by cc.x
## t = -0.038983, df = 490.73, p-value = 0.9689
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.211407 1.164273
## sample estimates:
## mean in group 0 mean in group 1
## 47.79757 47.82114
ggplot(pheno.df) +
aes(refage, fill=as.factor(cc.x)) +
geom_density(colour="black", alpha=0.3)
# Categorical age codings
dxyr_stratum[1:5]
## [1] 2 2 3 1 1
age_stratum1[1:5]
## [1] 45to49 45to49 45to49 40to44 40to44
## Levels: 20to34 35to39 40to44 45to49 50to54
sub_dx_age_cat[1:5]
## [1] 0 0 0 1 1
# Unknown offset
# consistent with 1+ year between the 1st and 2nd tumours (selection criteria)
offset.x[1:5]
## [1] 6.406880 5.820083 5.609472 4.034241 4.770685
sum(offset.x != offset.y)
## [1] 0
hist(offset.x)
# Compare to rstime (~age at ds - age at 2nd ds???)
ggplot(pheno.df, colour="black") +
geom_density(aes(offset.x), fill="red", alpha=0.3) +
geom_density(aes(rstime), fill="blue", alpha=0.3)
min(offset.x)
## [1] 1.252763
min(rstime)
## [1] 1
# Any treatment
treatment.x[1:5]
## [1] 1 1 1 0 1
treatment.y[1:5]
## [1] 1 1 1 0 1
sum(treatment.x != treatment.y)
## [1] 0
ggplot(pheno.df) +
aes(as.factor(treatment.x), fill=as.factor(cc.x)) +
geom_bar(colour="black", alpha=0.3)
# There was more treatment in controls
table(cc.x, treatment.x)
## treatment.x
## cc.x 0 1
## 0 89 158
## 1 112 134
fisher.test(table(cc.x, treatment.x))
##
## Fisher's Exact Test for Count Data
##
## data: table(cc.x, treatment.x)
## p-value = 0.03518
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.4619463 0.9827664
## sample estimates:
## odds ratio
## 0.6744954
ggplot(pheno.df) +
aes(as.factor(cc.x), fill=as.factor(treatment.x)) +
geom_bar(colour="black", alpha=0.3)
# for comparison (neither will be used later)
chemo_hormone[1:5]
## [1] chem both horm none both
## Levels: both chem horm none
# Some missed data and discrepansies in endocrine treatment
# (when compare hormone, hormone_self_mra.x and hormone_self_mra.y)
hormone[1:5]
## [1] 0 1 1 0 1
hormone_self_mra.x[1:5]
## [1] 0 1 1 0 1
hormone_self_mra.y[1:5]
## [1] 0 1 1 0 1
sum(hormone_self_mra.x != hormone_self_mra.y, na.rm=TRUE)
## [1] 0
sum(hormone != hormone_self_mra.x, na.rm=TRUE)
## [1] 10
sum(is.na(hormone))
## [1] 1
sum(is.na(hormone_self_mra.x))
## [1] 0
sum(is.na(hormone_self_mra.y))
## [1] 1
ggplot(pheno.df) +
aes(as.factor(hormone), fill=as.factor(cc.x)) +
geom_bar(colour="black", alpha=0.3)
# There was no disbalance in cases and controls by hormonal treatment
table(cc.x,hormone)
## hormone
## cc.x 0 1
## 0 188 58
## 1 196 50
fisher.test(table(cc.x, hormone))
##
## Fisher's Exact Test for Count Data
##
## data: table(cc.x, hormone)
## p-value = 0.4459
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.5261579 1.2971381
## sample estimates:
## odds ratio
## 0.8272048
pheno.df %>%
filter(!is.na(hormone)) %>%
ggplot() +
aes(as.factor(cc.x), fill=as.factor(hormone)) +
geom_bar(colour="black", alpha=0.3)
# Hormone vs er:
# 64% er+ves were not given hormones:
# old cohort, young patients, cytotoxic instead ...?
# 11 er-ves were hiven hormones??
table(er1, hormone)
## hormone
## er1 0 1
## 0 101 11
## 1 148 82
sum(is.na(er1))
## [1] 150
# No missed data or discrepansies in cytotoxic treatment
# (when compare chemo, chemo_self_mra.x and chemo_self_mra.y)
chemo[1:5]
## [1] 1 1 0 0 1
sum(chemo_self_mra.x != chemo_self_mra.y)
## [1] 0
sum(chemo != chemo_self_mra.x)
## [1] 0
# Unbalanced cases and controls in treated/non-treated by cytotoxics
ggplot(pheno.df) +
aes(as.factor(chemo), fill=as.factor(cc.x)) +
geom_bar(colour="black", alpha=0.3)
# There was more chemo in controls
table(cc.x, chemo)
## chemo
## cc.x 0 1
## 0 108 139
## 1 136 110
fisher.test(table(cc.x, chemo))
##
## Fisher's Exact Test for Count Data
##
## data: table(cc.x, chemo)
## p-value = 0.01167
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.4334897 0.9108517
## sample estimates:
## odds ratio
## 0.6290542
ggplot(pheno.df) +
aes(as.factor(cc.x), fill=as.factor(chemo)) +
geom_bar(colour="black", alpha=0.3)
# No missed data or discrepansies in x-ray treatment
# (when compare XRTBrCHAR, XRTBreast.x and XRTBreast.y)
XRTBrCHAR[1:5]
## [1] 0 0 0 1 0
sum(XRTBreast.x != XRTBreast.y)
## [1] 0
sum(XRTBrCHAR != XRTBreast.x)
## [1] 0
# Unbalanced cases and controls in treated/non-treated by radiotherapy
ggplot(pheno.df) +
aes(as.factor(XRTBrCHAR), fill=as.factor(cc.x)) +
geom_bar(colour="black", alpha=0.3)
# There was more x_ray_treatment in controls
table(cc.x, XRTBrCHAR)
## XRTBrCHAR
## cc.x 0 1
## 0 103 144
## 1 136 110
fisher.test(table(cc.x, XRTBrCHAR))
##
## Fisher's Exact Test for Count Data
##
## data: table(cc.x, XRTBrCHAR)
## p-value = 0.002933
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.3985810 0.8394647
## sample estimates:
## odds ratio
## 0.5791743
ggplot(pheno.df) +
aes(as.factor(cc.x), fill=as.factor(XRTBrCHAR)) +
geom_bar(colour="black", alpha=0.3)
Ethnisity: todo PCI co-plotting with known populations Recalculate eigenvectors?
# Eigenvectors - duplicated
sum(Eigen_1.x != Eigen_1.y)
## [1] 0
sum(Eigen_2.x != Eigen_2.y)
## [1] 0
sum(Eigen_3.x != Eigen_3.y)
## [1] 0
sum(Eigen_4.x != Eigen_4.y)
## [1] 0
sum(Eigen_5.x != Eigen_5.y)
## [1] 0
# setno (unknown field)
setno.x[1:5]
## [1] 382125 204356 360683 226881 357431
sum(setno.x != setno.y)
## [1] 0
t.test(setno.x~cc.x)
##
## Welch Two Sample t-test
##
## data: setno.x by cc.x
## t = -0.077993, df = 491, p-value = 0.9379
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -10863.19 10033.68
## sample estimates:
## mean in group 0 mean in group 1
## 304727.1 305141.9
ggplot(pheno.df) +
aes(setno.x, fill=as.factor(cc.x)) +
geom_density(colour="black", alpha=0.3)
# Registry
registry.x[1:5]
## [1] de IR IR SE SE
## Levels: de IA IR LA ne SE
sum(as.vector(registry.x) != as.vector(registry.y))
## [1] 0
fisher.test(table(cc.x,registry.x))
##
## Fisher's Exact Test for Count Data
##
## data: table(cc.x, registry.x)
## p-value = 1
## alternative hypothesis: two.sided
ggplot(pheno.df) +
aes(registry.x, fill=as.factor(cc.x)) +
geom_bar(colour="black", alpha=0.3)
# Good location (unknown field)
good_location.x[1:5]
## [1] 1 1 0 1 1
sum(good_location.x != good_location.y)
## [1] 0
fisher.test(table(cc.x,good_location.x))
##
## Fisher's Exact Test for Count Data
##
## data: table(cc.x, good_location.x)
## p-value = 1
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.6049165 1.6370561
## sample estimates:
## odds ratio
## 0.9951316
ggplot(pheno.df) +
aes(good_location.x, fill=as.factor(cc.x)) +
geom_bar(colour="black", alpha=0.3)
# Status is equal to cc
status.x[1:5]
## [1] 0 0 0 0 0
sum(status.x != status.y)
## [1] 0
table(cc.x, status.x)
## status.x
## cc.x 0 1
## 0 247 0
## 1 0 246
sum(status2.x != status2.y)
## [1] 0
sum(status.x != status2.y)
## [1] 0
# Race
sum(race.x != race.y)
## [1] 0
sum(sub_race != race.y)
## [1] 0
sum(sub_race)
## [1] 0
# dose
dose[1:5]
## [1] no dose no dose no dose ls 1Gy no dose
## Levels: ge 1Gy ls 1Gy no dose
class(dose)
## [1] "factor"
summary(dose)
## ge 1Gy ls 1Gy no dose
## 107 150 236
ggplot(pheno.df) +
aes(as.factor(dose), fill=as.factor(cc.x)) +
geom_bar(colour="black", alpha=0.3)
# More x-ray treatment in controls
fisher.test(table(cc.x, dose))
##
## Fisher's Exact Test for Count Data
##
## data: table(cc.x, dose)
## p-value = 0.02258
## alternative hypothesis: two.sided
# Balanced doses in x-ray treated cases and controls
pheno.df %>%
select(cc.x,dose) %>%
filter(dose %in% c("ge 1Gy","ls 1Gy")) %>%
table %>%
fisher.test
##
## Fisher's Exact Test for Count Data
##
## data: .
## p-value = 0.8002
## alternative hypothesis: two.sided
# No missed data in dose
sum(is.na(dose))
## [1] 0
# dose_caseloc
dose_caseloc[1:5]
## [1] 0.00 0.00 0.00 0.96 NA
class(dose_caseloc)
## [1] "numeric"
ggplot(pheno.df) +
aes(dose_caseloc, fill=as.factor(cc.x)) +
geom_histogram(colour="black", alpha=0.3)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing non-finite values (stat_bin).
ggplot(pheno.df) +
aes(dose_caseloc, fill=as.factor(cc.x)) +
geom_density(colour="black", alpha=0.3)
## Warning: Removed 3 rows containing non-finite values (stat_density).
sum(dose_caseloc == 0, na.rm=TRUE)
## [1] 233
sum(dose_caseloc != 0 & dose_caseloc < 1, na.rm=TRUE)
## [1] 150
sum(dose_caseloc >=1, na.rm=TRUE)
## [1] 107
sum(is.na(dose_caseloc))
## [1] 3
# Numerically coded dose (0,1,2)
dose_num[1:5]
## [1] 0 0 0 1 NA
class(dose_num)
## [1] "numeric"
ggplot(pheno.df) +
aes(dose_num, fill=as.factor(cc.x)) +
geom_bar(colour="black", alpha=0.3)
## Warning: Removed 3 rows containing non-finite values (stat_count).
table(dose_num)
## dose_num
## 0 1 2
## 233 150 107
sum(is.na(dose_num))
## [1] 3
# Disbalance btw cases and controls (higher doses in controls?)
table(cc.x,dose_num)
## dose_num
## cc.x 0 1 2
## 0 101 83 61
## 1 132 67 46
fisher.test(table(cc.x,dose_num))
##
## Fisher's Exact Test for Count Data
##
## data: table(cc.x, dose_num)
## p-value = 0.01868
## alternative hypothesis: two.sided
ggplot(pheno.df) +
aes(as.factor(cc.x), fill=as.factor(dose_num)) +
geom_bar(colour="black", alpha=0.3)
# Avedose (unknown dose)
avedose[1:5]
## [1] 0.00 0.00 0.00 1.65 NA
class(avedose)
## [1] "numeric"
ggplot(pheno.df) +
aes(avedose, fill=as.factor(cc.x)) +
geom_histogram(colour="black", alpha=0.3)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing non-finite values (stat_bin).
ggplot(pheno.df) +
aes(avedose, fill=as.factor(cc.x)) +
geom_density(colour="black", alpha=0.3)
## Warning: Removed 3 rows containing non-finite values (stat_density).
# Disbalance between cases and controls
t.test(avedose~cc.x)
##
## Welch Two Sample t-test
##
## data: avedose by cc.x
## t = 3.0094, df = 480.24, p-value = 0.002755
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.07246196 0.34508906
## sample estimates:
## mean in group 0 mean in group 1
## 0.7741633 0.5653878
# More balanced amongst treated?
pheno.df %>% filter(avedose!=0) %>%
t.test(data=.,avedose~cc.x)
##
## Welch Two Sample t-test
##
## data: avedose by cc.x
## t = 1.224, df = 252.96, p-value = 0.2221
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.05560374 0.23822788
## sample estimates:
## mean in group 0 mean in group 1
## 1.317153 1.225841
# Inconsistent data on endocrine treatment
# Apparently 9 patients did not received hormones,
# while receiving tamoxifen??
table(hormone,tmx)
## tmx
## hormone 0 1
## 0 375 9
## 1 23 85
# horm_tmx is different from tmx
# Apparently 0 is no hormonal treatment
# What is coded as 1 and 2 ??
horm_tmx[1:5]
## [1] 0 1 1 0 2
ggplot(pheno.df) +
aes(as.factor(horm_tmx), fill=as.factor(cc.x)) +
geom_bar(colour="black", alpha=0.3)
table(horm_tmx)
## horm_tmx
## 0 1 2
## 375 94 24
sum(is.na(horm_tmx))
## [1] 0
# Tmx
tmx[1:5]
## [1] 0 1 1 0 0
barplot(table(tmx))
table(tmx)
## tmx
## 0 1
## 399 94
sum(is.na(tmx))
## [1] 0
# Treatment by age: hormones more often are given to older patients
x <- !is.na(hormone)
ggplot(pheno.df[x,]) +
aes(x = sub_dx_age.x, fill = as.factor(hormone)) +
geom_density(colour="black", alpha=0.3)
rm(x)
t.test(sub_dx_age.x~hormone)
##
## Welch Two Sample t-test
##
## data: sub_dx_age.x by hormone
## t = -2.7477, df = 199.2, p-value = 0.006552
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.326010 -0.382323
## sample estimates:
## mean in group 0 mean in group 1
## 41.97917 43.33333
# Reminder: chemo is identical to chemo.x/y
chemo[1:5]
## [1] 1 1 0 0 1
ggplot(pheno.df) +
aes(as.factor(chemo), fill=as.factor(cc.x)) +
geom_bar(colour="black", alpha=0.3)
table(chemo)
## chemo
## 0 1
## 244 249
sum(is.na(chemo))
## [1] 0
# Treatment by age: cytotoxic are more often given to younger patients
ggplot(pheno.df) +
aes(x = sub_dx_age.x, fill = as.factor(chemo)) +
geom_density(colour="black", alpha=0.3)
t.test(sub_dx_age.x~chemo)
##
## Welch Two Sample t-test
##
## data: sub_dx_age.x by chemo
## t = 6.0245, df = 461.28, p-value = 3.47e-09
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 1.761803 3.467540
## sample estimates:
## mean in group 0 mean in group 1
## 43.61066 40.99598
# cmf uses wrong encodings
cmf[1:5]
## [1] CMF Oth no no CMF
## Levels: \xb0\x9b@ \024\x9c@ CMF no Oth
ggplot(pheno.df) +
aes(as.factor(cmf), fill=as.factor(cc.x)) +
geom_bar(colour="black", alpha=0.3)
table(cmf)
## cmf
## \xb0\x9b@ \024\x9c@ CMF no Oth
## 0 0 0 153 244 96
sum(is.na(cmf))
## [1] 0
# cmf012 is numeric cmf
# 0= none, 1=cmf, 2=other (the order is not meaningful)
cmf_012[1:5]
## [1] 1 2 0 0 1
ggplot(pheno.df) +
aes(as.factor(cmf_012), fill=as.factor(cc.x)) +
geom_bar(colour="black", alpha=0.3)
table(cmf_012)
## cmf_012
## 0 1 2
## 244 153 96
sum(is.na(cmf_012))
## [1] 0
# CMF is coded better than cmf
CMF[1:5]
## [1] CMF Oth no no Oth
## Levels: CMF no Oth
ggplot(pheno.df) +
aes(as.factor(CMF), fill=as.factor(cc.x)) +
geom_bar(colour="black", alpha=0.3)
table(CMF)
## CMF
## CMF no Oth
## 131 244 118
sum(is.na(CMF))
## [1] 0
# CMF vs age
ggplot(pheno.df) +
aes(x = sub_dx_age.x, fill = as.factor(CMF)) +
geom_density(colour="black", alpha=0.3)
# CMF is different from cmf
sum(as.vector(cmf) != as.vector(CMF))
## [1] 22
# CMF only
CMF_Only[1:5]
## [1] 1 0 0 0 0
ggplot(pheno.df) +
aes(as.factor(CMF_Only), fill=as.factor(cc.x)) +
geom_bar(colour="black", alpha=0.3)
table(CMF_Only)
## CMF_Only
## 0 1
## 362 131
sum(is.na(CMF_Only))
## [1] 0
# # Treatment by age: patients given cytotoxics are younger than those given hormones
ggplot() +
geom_density(data=pheno.df[chemo==1,], aes(sub_dx_age.x), colour="black", fill="red", alpha=0.25) +
geom_density(data=pheno.df[hormone==1,], aes(sub_dx_age.x), colour="black", fill="blue", alpha=0.25)
## Warning: Removed 1 rows containing non-finite values (stat_density).
t.test(sub_dx_age.x[chemo==1],sub_dx_age.x[hormone==1])
##
## Welch Two Sample t-test
##
## data: sub_dx_age.x[chemo == 1] and sub_dx_age.x[hormone == 1]
## t = -4.3157, df = 252.3, p-value = 2.286e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -3.403967 -1.270731
## sample estimates:
## mean of x mean of y
## 40.99598 43.33333
# Stage
stage_fd[1:5]
## [1] 2 1 1 1 2
summary(as.factor(stage_fd))
## 1 2
## 344 149
ggplot(pheno.df) +
aes(as.factor(stage_fd), fill=as.factor(cc.x)) +
geom_bar(colour="black", alpha=0.3)
# No significant differences by stage between cases and controls
# However, tendency to higher stage in controls,
# which might had caused more cytotoxics and x-ray in controls
fisher.test(table(cc.x, stage_fd))
##
## Fisher's Exact Test for Count Data
##
## data: table(cc.x, stage_fd)
## p-value = 0.07746
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.4641429 1.0446625
## sample estimates:
## odds ratio
## 0.6973788
ggplot(pheno.df) +
aes(as.factor(cc.x), fill=as.factor(stage_fd)) +
geom_bar(colour="black", alpha=0.3)
# Histology
# Does not make sense to pool nst and tubular?
Histology[1:5]
## [1] other other other medullar other
## Levels: lobular medullar other r othe
unique(as.vector(Histology))
## [1] "other" "medullar" "lobular"
ggplot(pheno.df) +
aes(Histology, fill=Histology) +
geom_bar(colour="black", alpha=0.3)
table(Histology)
## Histology
## lobular medullar other r othe
## 50 13 430 0
# Histology1 - swop between lobular and other
Histology1[1:5]
## [1] other other other medullar other
## Levels: lobular medullar other r othe
unique(as.vector(Histology1))
## [1] "other" "medullar" "lobular"
ggplot(pheno.df) +
aes(Histology1, fill=Histology1) +
geom_bar(colour="black", alpha=0.3)
table(Histology1)
## Histology1
## lobular medullar other r othe
## 36 13 444 0
sum(as.vector(Histology) != as.vector(Histology1))
## [1] 14
pheno.df %>%
select(labid.x, Histology, Histology1) %>%
filter(as.vector(Histology) != as.vector(Histology1))
## labid.x Histology Histology1
## 1 id203568 lobular other
## 2 id216437 lobular other
## 3 id246347 lobular other
## 4 id260716 lobular other
## 5 id270215 lobular other
## 6 id275762 lobular other
## 7 id278053 lobular other
## 8 id298289 lobular other
## 9 id301831 lobular other
## 10 id310684 lobular other
## 11 id320617 lobular other
## 12 id324813 lobular other
## 13 id337867 lobular other
## 14 id371301 lobular other
#pheno.df[
# as.vector(Histology) != as.vector(Histology1),
# c("labid.x","Histology","Histology1")]
# histol_cat
histo1_cat[1:5]
## [1] unknown ductal ductal medullary ductal
## 9 Levels: al posi al unkn ductal lobular ... unknown
unique(as.vector(histo1_cat))
## [1] "unknown" "ductal" "medullary"
## [4] "lobular" "Tubular/mucinous" "other carcinoma"
# Cases and controls are balanced by histology
ggplot(pheno.df) +
aes(histo1_cat, fill=as.factor(cc.x)) +
geom_bar(colour="black", alpha=0.3)
table(cc.x, histo1_cat)
## histo1_cat
## cc.x al posi al unkn ductal lobular medullary
## 0 0 0 205 19 6
## 1 0 0 190 31 7
## histo1_cat
## cc.x other carcinoma r carcinoma posi Tubular/mucinous unknown
## 0 8 0 8 1
## 1 10 0 8 0
fisher.test(table(cc.x, histo1_cat))
##
## Fisher's Exact Test for Count Data
##
## data: table(cc.x, histo1_cat)
## p-value = 0.4383
## alternative hypothesis: two.sided
# Balance within ductal and lobular:
# non-significant trend to have more lobular in CBC
pheno.df %>%
filter(histo1_cat %in% c("ductal", "lobular")) %>%
ggplot() +
aes(histo1_cat, fill=as.factor(cc.x)) +
geom_bar(colour="black", alpha=0.3)
x <- table(
as.vector(cc.x[histo1_cat == "ductal" | histo1_cat == "lobular"]),
as.vector(histo1_cat[histo1_cat == "ductal" | histo1_cat == "lobular"]))
x
##
## ductal lobular
## 0 205 19
## 1 190 31
fisher.test(x)
##
## Fisher's Exact Test for Count Data
##
## data: x
## p-value = 0.07223
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.9266881 3.4130751
## sample estimates:
## odds ratio
## 1.758151
rm(x)
# Lobular only
lobular[1:5]
## [1] 0 0 0 0 0
unique(as.vector(lobular))
## [1] 0 1
ggplot(pheno.df) +
aes(as.factor(lobular), fill=as.factor(cc.x)) +
geom_bar(colour="black", alpha=0.3)
table(lobular)
## lobular
## 0 1
## 443 50
# Hist_lob_other
Hist_lob_other[1:5]
## [1] other other other other other
## Levels: lobular other r duct r othe
unique(as.vector(Hist_lob_other))
## [1] "other" "lobular"
ggplot(pheno.df) +
aes(Hist_lob_other, fill=as.factor(cc.x)) +
geom_bar(colour="black", alpha=0.3)
table(Hist_lob_other)
## Hist_lob_other
## lobular other r duct r othe
## 50 443 0 0
check consistency with treatment? much more more er+ ve? many er unknown
# er1
er1[1:5]
## [1] 1 1 1 NA 1
class(er1)
## [1] "integer"
unique(as.vector(er1))
## [1] 1 NA 0
ggplot(pheno.df) +
aes(as.factor(er1), fill=as.factor(cc.x)) +
geom_bar(colour="black", alpha=0.3)
summary(as.factor(er1))
## 0 1 NA's
## 112 231 150
# Balanced between cases and controls
table(cc.x, er1)
## er1
## cc.x 0 1
## 0 63 114
## 1 49 117
fisher.test(table(cc.x,er1))
##
## Fisher's Exact Test for Count Data
##
## data: table(cc.x, er1)
## p-value = 0.2504
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.8178203 2.1335103
## sample estimates:
## odds ratio
## 1.31847
ggplot(pheno.df) +
aes(as.factor(cc.x), fill=as.factor(er1)) +
geom_bar(colour="black", alpha=0.3)
# er1_num (=er1)
er1_num[1:5]
## [1] 1 1 1 NA 1
class(er1_num)
## [1] "numeric"
unique(as.vector(er1_num))
## [1] 1 NA 0
ggplot(pheno.df) +
aes(as.factor(er1_num), fill=as.factor(cc.x)) +
geom_bar(colour="black", alpha=0.3)
table(as.factor(er1_num))
##
## 0 1
## 112 231
sum(is.na(er1_num))
## [1] 150
ggplot(pheno.df) +
aes(as.factor(cc.x), fill=as.factor(er1_num)) +
geom_bar(colour="black", alpha=0.3)
sum(as.vector(er1) != as.vector(er1), na.rm = TRUE)
## [1] 0
# er1_cat
er1_cat[1:5]
## [1] positive positive positive unknown positive
## Levels: negative own unkn positive tiveposi unknown
class(er1_cat)
## [1] "factor"
unique(as.vector(er1_cat))
## [1] "positive" "unknown" "negative"
ggplot(pheno.df) +
aes(as.factor(er1_cat), fill=as.factor(cc.x)) +
geom_bar(colour="black", alpha=0.3)
table(as.vector(er1_cat))
##
## negative positive unknown
## 113 231 149
sum(is.na(er1_cat))
## [1] 0
# Balanced between cases and controls
x <- table(cc.x, er1_cat)[,c(1,3,5)]
x
## er1_cat
## cc.x negative positive unknown
## 0 64 114 69
## 1 49 117 80
fisher.test(x)
##
## Fisher's Exact Test for Count Data
##
## data: x
## p-value = 0.2429
## alternative hypothesis: two.sided
rm(x)
# er_fd
er_fd[1:5]
## [1] 1 1 1 4 1
class(er_fd)
## [1] "numeric"
unique(as.vector(er_fd))
## [1] 1 4 2 0 9 3
ggplot(pheno.df) +
aes(as.factor(er_fd), fill=as.factor(cc.x)) +
geom_bar(colour="black", alpha=0.3)
table(as.vector(er_fd))
##
## 0 1 2 3 4 9
## 103 231 112 17 4 26
sum(is.na(er_fd))
## [1] 0
# pr1_cat
pr1_cat[1:5]
## [1] positive positive positive negative positive
## 7 Levels: negative own 0 Ot positive tive01Ot tive11no ... unknown
class(pr1_cat)
## [1] "factor"
unique(as.vector(pr1_cat))
## [1] "positive" "negative" "unknown"
ggplot(pheno.df) +
aes(as.factor(pr1_cat), fill=as.factor(cc.x)) +
geom_bar(colour="black", alpha=0.3)
table(as.vector(pr1_cat))
##
## negative positive unknown
## 95 188 210
sum(is.na(pr1_cat))
## [1] 0
# Balanced between cases and controls
x <- table(cc.x, pr1_cat)[,c(1,3,7)]
fisher.test(x)
##
## Fisher's Exact Test for Count Data
##
## data: x
## p-value = 0.4529
## alternative hypothesis: two.sided
rm(x)
ggplot(pheno.df) +
aes(as.factor(cc.x), fill=as.factor(pr1_cat)) +
geom_bar(colour="black", alpha=0.3)
# pr correlate with er
table(er1, pr1_cat)[,c(1,3)]
## pr1_cat
## er1 negative positive
## 0 66 23
## 1 26 163
fisher.test(table(er1, pr1_cat)[,c(1,3)])
##
## Fisher's Exact Test for Count Data
##
## data: table(er1, pr1_cat)[, c(1, 3)]
## p-value < 2.2e-16
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 9.173234 35.515099
## sample estimates:
## odds ratio
## 17.71474
table(er1, pr1_cat)[,c(1,3,7)]
## pr1_cat
## er1 negative positive unknown
## 0 66 23 23
## 1 26 163 42
fisher.test(table(er1, pr1_cat)[,c(1,3,7)])
##
## Fisher's Exact Test for Count Data
##
## data: table(er1, pr1_cat)[, c(1, 3, 7)]
## p-value < 2.2e-16
## alternative hypothesis: two.sided
# Combined er and pr field
pheno.df <- pheno.df %>% mutate(epr=paste(er1_cat, pr1_cat))
# Look at combined faceted plots
ggplot(pheno.df) +
aes( as.factor(cc.x), fill=as.factor(epr)) +
geom_bar(colour="black", alpha=0.3)
ggplot(pheno.df) +
aes( as.factor(cc.x), fill=as.factor(epr)) +
geom_bar(colour="black", alpha=0.3) +
facet_grid(er1~pr1_cat)
# Excess of controls in ER-ve, PgR-ve
# Excess of controls in ER-ve, PgR+ve??
# Balanced cases and controls in ER+ve, PgR-ve
# Balanced cases and controls in ER+ve, PgR+ve
# ===================================================== #
# pr_fd
pr_fd[1:5]
## [1] 1 1 1 2 1
class(pr_fd)
## [1] "numeric"
unique(as.vector(pr_fd))
## [1] 1 2 0 9 3 4
ggplot(pheno.df) +
aes(as.factor(pr_fd), fill=as.factor(cc.x)) +
geom_bar(colour="black", alpha=0.3)
table(as.vector(pr_fd))
##
## 0 1 2 3 4 9
## 157 188 94 16 5 33
fisher.test(table(cc.x,pr_fd))
##
## Fisher's Exact Test for Count Data
##
## data: table(cc.x, pr_fd)
## p-value = 0.6308
## alternative hypothesis: two.sided
sum(is.na(pr_fd))
## [1] 0
# age_menarche (categorical)
age_menarche[1:5]
## [1] 1 1 1 1 0
class(age_menarche)
## [1] "integer"
unique(age_menarche)
## [1] 1 0
ggplot(pheno.df) +
aes(as.factor(age_menarche), fill=as.factor(cc.x)) +
geom_bar(colour="black", alpha=0.3)
table(age_menarche)
## age_menarche
## 0 1
## 228 265
sum(is.na(age_menarche))
## [1] 0
# Balanced between cases and controls
table(cc.x, age_menarche)
## age_menarche
## cc.x 0 1
## 0 109 138
## 1 119 127
fisher.test(table(cc.x,age_menarche))
##
## Fisher's Exact Test for Count Data
##
## data: table(cc.x, age_menarche)
## p-value = 0.3669
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.5821688 1.2204481
## sample estimates:
## odds ratio
## 0.843251
sum(is.na(age_menarche))
## [1] 0
ggplot(pheno.df) +
aes(as.factor(cc.x), fill=as.factor(age_menarche)) +
geom_bar(colour="black", alpha=0.3)
# Age menarche (numeric)
rh_age_menarche[1:5]
## [1] 13 14 13 13 12
class(rh_age_menarche)
## [1] "numeric"
unique(rh_age_menarche)
## [1] 13 14 12 9 11 10 15 18 16 17 0 19
hist(rh_age_menarche)
ggplot(pheno.df) +
aes(rh_age_menarche, fill=as.factor(cc.x)) +
geom_histogram(colour="black", alpha=0.3, binwidth=1) +
stat_bin(binwidth=1, geom="text", aes(label=..count..), vjust=-1)
ggplot(pheno.df) +
aes(rh_age_menarche, fill=as.factor(cc.x)) +
geom_histogram(colour="black", alpha=0.3, binwidth=1)
table(cc.x, rh_age_menarche)
## rh_age_menarche
## cc.x 0 9 10 11 12 13 14 15 16 17 18 19
## 0 2 7 7 30 63 72 38 16 8 3 1 0
## 1 2 5 14 28 70 69 29 19 5 3 1 1
sum(is.na(rh_age_menarche))
## [1] 0
# Balanced between cases and controls
t.test(rh_age_menarche~cc.x)
##
## Welch Two Sample t-test
##
## data: rh_age_menarche by cc.x
## t = 0.51979, df = 490.82, p-value = 0.6034
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.2525971 0.4343225
## sample estimates:
## mean in group 0 mean in group 1
## 12.64777 12.55691
# odd density plot behaving ...
ggplot(pheno.df) +
aes(rh_age_menarche, fill=as.factor(cc.x)) +
geom_density(colour="black", alpha=0.3)
# adjusted plot
ggplot(pheno.df) +
aes(rh_age_menarche, fill=as.factor(cc.x)) +
geom_density(colour="black", alpha=0.3, adjust=3)
# age_menopause: 0,1,2 (pre, in, post?)
age_menopause[1:5]
## [1] 0 2 0 0 0
class(age_menopause)
## [1] "integer"
unique(age_menopause)
## [1] 0 2 1
ggplot(pheno.df) +
aes(as.factor(age_menopause), fill=as.factor(cc.x)) +
geom_bar(colour="black", alpha=0.3)
table(age_menopause)
## age_menopause
## 0 1 2
## 400 72 21
sum(is.na(age_menopause))
## [1] 0
# Balanced cases and controls
table(cc.x, age_menopause)
## age_menopause
## cc.x 0 1 2
## 0 201 37 9
## 1 199 35 12
fisher.test(table(cc.x, age_menopause))
##
## Fisher's Exact Test for Count Data
##
## data: table(cc.x, age_menopause)
## p-value = 0.793
## alternative hypothesis: two.sided
ggplot(pheno.df) +
aes(as.factor(cc.x), fill=as.factor(age_menopause)) +
geom_bar(colour="black", alpha=0.3)
# age_menopause_1yrbf_fd: what is this?
# Does not look as menopausal age...
age_menopause_1yrbf_fd[1:5]
## [1] -1 48 -1 -1 -1
class(age_menopause_1yrbf_fd)
## [1] "numeric"
unique(age_menopause_1yrbf_fd)
## [1] -1 48 49 29 33 46 31 41 43 25 45 39 40 36 35 28 27 38 NA 42 47 34 32
## [24] 44 52 26 37
ggplot(pheno.df) +
aes(age_menopause_1yrbf_fd, fill=as.factor(cc.x)) +
geom_histogram(colour="black", alpha=0.3, binwidth=1)
## Warning: Removed 3 rows containing non-finite values (stat_bin).
table(age_menopause_1yrbf_fd)
## age_menopause_1yrbf_fd
## -1 25 26 27 28 29 31 32 33 34 35 36 37 38 39 40 41 42
## 397 1 1 2 2 5 3 2 4 3 5 5 3 3 1 6 7 8
## 43 44 45 46 47 48 49 52
## 7 4 6 3 5 4 2 1
sum(is.na(age_menopause_1yrbf_fd))
## [1] 3
# Balanced between cases and controls
t.test(age_menopause_1yrbf_fd~cc.x)
##
## Welch Two Sample t-test
##
## data: age_menopause_1yrbf_fd by cc.x
## t = -0.37457, df = 486.79, p-value = 0.7081
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -3.384364 2.300597
## sample estimates:
## mean in group 0 mean in group 1
## 6.348361 6.890244
pheno.df %>%
filter(age_menopause_1yrbf_fd != -1) %>%
t.test(data=., age_menopause_1yrbf_fd~cc.x)
##
## Welch Two Sample t-test
##
## data: age_menopause_1yrbf_fd by cc.x
## t = -1.7734, df = 89.443, p-value = 0.07957
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -4.918452 0.279229
## sample estimates:
## mean in group 0 mean in group 1
## 37.97826 40.29787
pheno.df %>%
filter(age_menopause_1yrbf_fd != -1) %>%
ggplot() +
aes(age_menopause_1yrbf_fd, fill=as.factor(cc.x)) +
geom_histogram(colour="black", alpha=0.3, binwidth=1)
pheno.df %>%
filter(age_menopause_1yrbf_fd != -1) %>%
ggplot() +
aes(age_menopause_1yrbf_fd, fill=as.factor(cc.x)) +
geom_density(colour="black", alpha=0.3)
# Of those who is in mp there is trent to have younger mp age in controls
# This may be explained by more cytotoxic treatment in this group
# age_1fftp_fd: what is this? another age of a childbearing?
# ftp - in my prev life meant "full term pregnancy"?
# Less pregnancies in cases than in controls?
age_1fftp_fd[1:5]
## [1] 20 22 0 29 28
class(age_1fftp_fd)
## [1] "numeric"
unique(age_1fftp_fd)
## [1] 20 22 0 29 28 27 24 30 21 23 25 26 38 19 40 17 31 18 39 37 32 33 16
## [24] 36 35 34 15
ggplot(pheno.df) +
aes(age_1fftp_fd, fill=as.factor(cc.x)) +
geom_histogram(colour="black", alpha=0.3, binwidth=1)
ggplot(pheno.df) +
aes(age_1fftp_fd, fill=as.factor(cc.x)) +
geom_density(colour="black", alpha=0.3)
pheno.df %>% filter(age_1fftp_fd!=0) %>%
ggplot() +
aes(age_1fftp_fd, fill=as.factor(cc.x)) +
geom_density(colour="black", alpha=0.3)
pheno.df %>% filter(age_1fftp_fd!=0) %>%
t.test(data=.,age_1fftp_fd~cc.x)
##
## Welch Two Sample t-test
##
## data: age_1fftp_fd by cc.x
## t = -0.88651, df = 393.25, p-value = 0.3759
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.4164195 0.5360263
## sample estimates:
## mean in group 0 mean in group 1
## 24.85514 25.29534
table(age_1fftp_fd)
## age_1fftp_fd
## 0 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38
## 86 1 3 7 10 26 33 29 32 38 28 35 22 20 29 16 19 16 12 6 5 4 4 4 2
## 39 40
## 3 3
sum(is.na(age_1fftp_fd))
## [1] 0
# Balanced between cases and controls
t.test(age_1fftp_fd~cc.x)
##
## Welch Two Sample t-test
##
## data: age_1fftp_fd by cc.x
## t = 1.7818, df = 476.59, p-value = 0.07541
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.1735768 3.5513458
## sample estimates:
## mean in group 0 mean in group 1
## 21.53441 19.84553
pheno.df %>%
filter(age_1fftp_fd != 0) %>%
t.test(data=., age_1fftp_fd~cc.x)
##
## Welch Two Sample t-test
##
## data: age_1fftp_fd by cc.x
## t = -0.88651, df = 393.25, p-value = 0.3759
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.4164195 0.5360263
## sample estimates:
## mean in group 0 mean in group 1
## 24.85514 25.29534
# Num_ftp_fd - what would it be?
# Num of full time pregnancies???
# The numnbers do not make sence and do not match the num_preg
Num_ftp_fd[1:5]
## [1] 1 2 -1 2 2
class(Num_ftp_fd)
## [1] "numeric"
unique(Num_ftp_fd)
## [1] 1 2 -1 3 4 0 5 6 7
ggplot(pheno.df) +
aes(as.factor(Num_ftp_fd), fill=as.factor(cc.x)) +
geom_bar(colour="black", alpha=0.3)
ggplot(pheno.df) +
aes(as.factor(cc.x), fill=as.factor(Num_ftp_fd)) +
geom_bar(colour="black", alpha=0.3)
table(cc.x, Num_ftp_fd)
## Num_ftp_fd
## cc.x -1 0 1 2 3 4 5 6 7
## 0 23 10 38 106 45 19 4 1 1
## 1 31 22 45 104 32 10 2 0 0
sum(is.na(Num_ftp_fd))
## [1] 0
# Balanced in cases and controls
#fisher.test(table(cc.x, Num_ftp_fd)) # can not calculate ...
fisher.test(table(cc.x, Num_ftp_fd)[,1:5]) # non-sign trend
##
## Fisher's Exact Test for Count Data
##
## data: table(cc.x, Num_ftp_fd)[, 1:5]
## p-value = 0.08616
## alternative hypothesis: two.sided
# num_preg
num_preg[1:5]
## [1] 1 1 0 1 1
class(num_preg)
## [1] "numeric"
unique(num_preg)
## [1] 1 0 2
ggplot(pheno.df) +
aes(as.factor(num_preg), fill=as.factor(cc.x)) +
geom_bar(colour="black", alpha=0.3)
sum(is.na(num_preg))
## [1] 0
# Less pregnancies in cases
table(cc.x, num_preg)
## num_preg
## cc.x 0 1 2
## 0 33 189 25
## 1 53 181 12
sum(is.na(num_preg))
## [1] 0
fisher.test(table(cc.x, num_preg))
##
## Fisher's Exact Test for Count Data
##
## data: table(cc.x, num_preg)
## p-value = 0.009058
## alternative hypothesis: two.sided
ggplot(pheno.df) +
aes(as.factor(cc.x), fill=as.factor(num_preg)) +
geom_bar(colour="black", alpha=0.3)
# No CBC with 3-pregnancies ...
this is about lafelong exposure to Eg
in pre-menopause it is protective this is about endocrine disorders and disruption of ms-cyclingand about cumulative lafelong exposure to Eg too speculative …
# BMI_age18
BMI_age18[1:5]
## [1] 20.20202 19.73983 23.29737 19.46995 25.84315
class(BMI_age18)
## [1] "numeric"
ggplot(pheno.df) +
aes(BMI_age18, fill=as.factor(cc.x)) +
geom_histogram(colour="black", alpha=0.3, binwidth=1)
## Warning: Removed 2 rows containing non-finite values (stat_bin).
ggplot(pheno.df) +
aes(BMI_age18, fill=as.factor(cc.x)) +
geom_density(colour="black", alpha=0.3)
## Warning: Removed 2 rows containing non-finite values (stat_density).
# controls tended to have slightly higher BMI at age of 18
# Caused by obese cases, consistent with protective role of
# obesity in pre-menopause
t.test(BMI_age18~cc.x)
##
## Welch Two Sample t-test
##
## data: BMI_age18 by cc.x
## t = 2.1121, df = 470.08, p-value = 0.03521
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.03452471 0.95732461
## sample estimates:
## mean in group 0 mean in group 1
## 20.64919 20.15327
summary(BMI_age18)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 13.56 18.79 20.08 20.40 21.63 34.96 2
sum(is.na(BMI_age18))
## [1] 2
# BMI_dx
BMI_dx[1:5]
## [1] 22.77319 20.94139 23.29737 32.61632 25.84315
class(BMI_dx)
## [1] "numeric"
ggplot(pheno.df) +
aes(BMI_dx, fill=as.factor(cc.x)) +
geom_histogram(colour="black", alpha=0.3, binwidth=1)
## Warning: Removed 3 rows containing non-finite values (stat_bin).
ggplot(pheno.df) +
aes(BMI_dx, fill=as.factor(cc.x)) +
geom_density(colour="black", alpha=0.3)
## Warning: Removed 3 rows containing non-finite values (stat_density).
# controls had higher BMI at Ds
t.test(BMI_dx~cc.x)
##
## Welch Two Sample t-test
##
## data: BMI_dx by cc.x
## t = 2.5725, df = 458.17, p-value = 0.01041
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.2469653 1.8452137
## sample estimates:
## mean in group 0 mean in group 1
## 23.93155 22.88546
summary(BMI_dx)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 14.10 20.37 22.31 23.41 25.09 58.77 3
sum(is.na(BMI_dx))
## [1] 3
# BMI_age18 vs BMI_dx
ggplot(pheno.df) +
aes(x=BMI_age18, y=BMI_dx, colour=as.factor(cc.x)) +
geom_point(alpha=0.3) +
geom_smooth(method="lm")
## Warning: Removed 5 rows containing non-finite values (stat_smooth).
## Warning: Removed 5 rows containing missing values (geom_point).
# BMI_ref
BMI_ref[1:5]
## [1] 25.71166 21.97129 25.79352 32.61632 28.05828
class(BMI_ref)
## [1] "numeric"
ggplot(pheno.df) +
aes(BMI_ref, fill=as.factor(cc.x)) +
geom_histogram(colour="black", alpha=0.3, binwidth=1)
## Warning: Removed 3 rows containing non-finite values (stat_bin).
ggplot(pheno.df) +
aes(BMI_ref, fill=as.factor(cc.x)) +
geom_density(colour="black", alpha=0.3)
## Warning: Removed 3 rows containing non-finite values (stat_density).
summary(BMI_ref)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 15.66 21.30 23.37 24.64 26.76 58.77 3
sum(is.na(BMI_ref))
## [1] 3
# BMI_ref vs BMI_dx
ggplot(pheno.df) +
aes(x=BMI_ref, y=BMI_dx, colour=as.factor(cc.x)) +
geom_point(alpha=0.3) +
geom_smooth(method="lm")
## Warning: Removed 4 rows containing non-finite values (stat_smooth).
## Warning: Removed 4 rows containing missing values (geom_point).
# DOB
DOB[1:5]
## [1] -5049 -7459 -3916 -5890 -6407
class(DOB)
## [1] "numeric"
hist(DOB)
ggplot(pheno.df) +
aes(DOB, fill=as.factor(cc.x)) +
geom_histogram(colour="black", alpha=0.3)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(pheno.df) +
aes(DOB, fill=as.factor(cc.x)) +
geom_density(colour="black", alpha=0.3)
# dxyr_stratum
dxyr_stratum[1:5]
## [1] 2 2 3 1 1
class(dxyr_stratum)
## [1] "integer"
summary(dxyr_stratum)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 2.000 1.842 2.000 4.000
ggplot(pheno.df) +
aes(dxyr_stratum, fill=as.factor(cc.x)) +
geom_bar(colour="black", alpha=0.3)
# Absolutely balanced between cases and controls
table(dxyr_stratum)
## dxyr_stratum
## 1 2 3 4
## 214 161 100 18
sum(is.na(dxyr_stratum))
## [1] 0
fisher.test(table(cc.x,dxyr_stratum))
##
## Fisher's Exact Test for Count Data
##
## data: table(cc.x, dxyr_stratum)
## p-value = 1
## alternative hypothesis: two.sided
ggplot(pheno.df) +
aes(as.factor(cc.x), fill=as.factor(dxyr_stratum)) +
geom_bar(colour="black", alpha=0.3)
# age_stratum1
age_stratum1[1:5]
## [1] 45to49 45to49 45to49 40to44 40to44
## Levels: 20to34 35to39 40to44 45to49 50to54
class(age_stratum1)
## [1] "factor"
summary(age_stratum1)
## 20to34 35to39 40to44 45to49 50to54
## 40 92 218 133 10
ggplot(pheno.df) +
aes(age_stratum1, fill=as.factor(cc.x)) +
geom_bar(colour="black", alpha=0.3)
table(age_stratum1)
## age_stratum1
## 20to34 35to39 40to44 45to49 50to54
## 40 92 218 133 10
sum(is.na(age_stratum1))
## [1] 0
# Absolutely balanced between cases and controls
table(age_stratum1)
## age_stratum1
## 20to34 35to39 40to44 45to49 50to54
## 40 92 218 133 10
sum(is.na(age_stratum1))
## [1] 0
fisher.test(table(cc.x,age_stratum1))
##
## Fisher's Exact Test for Count Data
##
## data: table(cc.x, age_stratum1)
## p-value = 1
## alternative hypothesis: two.sided
ggplot(pheno.df) +
aes(as.factor(cc.x), fill=as.factor(age_stratum1)) +
geom_bar(colour="black", alpha=0.3)
# sub_dx_age_cat
sub_dx_age_cat[1:5]
## [1] 0 0 0 1 1
class(sub_dx_age_cat)
## [1] "integer"
summary(sub_dx_age_cat)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 1.0000 0.6613 1.0000 1.0000
ggplot(pheno.df) +
aes(sub_dx_age_cat, fill=as.factor(cc.x)) +
geom_bar(colour="black", alpha=0.3)
table(sub_dx_age_cat)
## sub_dx_age_cat
## 0 1
## 167 326
sum(is.na(sub_dx_age_cat))
## [1] 0
# Balanced between cases and controls
fisher.test(table(cc.x,sub_dx_age_cat))
##
## Fisher's Exact Test for Count Data
##
## data: table(cc.x, sub_dx_age_cat)
## p-value = 0.7758
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.6369474 1.3909684
## sample estimates:
## odds ratio
## 0.941456
ggplot(pheno.df) +
aes(as.factor(cc.x), fill=as.factor(sub_dx_age_cat)) +
geom_bar(colour="black", alpha=0.3)
# refage
refage[1:5]
## [1] 53 59 51 50 52
class(refage)
## [1] "numeric"
summary(refage)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 27.00 44.00 48.00 47.81 52.00 70.00
ggplot(pheno.df) +
aes(refage, fill=as.factor(cc.x)) +
geom_histogram(colour="black", alpha=0.3, binwidth=1)
ggplot(pheno.df) +
aes(refage, fill=as.factor(cc.x)) +
geom_density(colour="black", alpha=0.3)
summary(refage)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 27.00 44.00 48.00 47.81 52.00 70.00
sum(is.na(refage))
## [1] 0
# balanced in cases and controls
t.test(refage~cc.x)
##
## Welch Two Sample t-test
##
## data: refage by cc.x
## t = -0.038983, df = 490.73, p-value = 0.9689
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.211407 1.164273
## sample estimates:
## mean in group 0 mean in group 1
## 47.79757 47.82114
# rstime
rstime[1:5]
## [1] 7.42 9.75 6.09 6.25 9.34
class(rstime)
## [1] "numeric"
summary(rstime)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 3.330 5.330 5.988 8.250 15.580
ggplot(pheno.df) +
aes(rstime, fill=as.factor(cc.x)) +
geom_histogram(colour="black", alpha=0.3, binwidth=1)
ggplot(pheno.df) +
aes(rstime, fill=as.factor(cc.x)) +
geom_density(colour="black", alpha=0.3)
sum(is.na(rstime))
## [1] 0
# balanced in cases and controls
t.test(refage~cc.x)
##
## Welch Two Sample t-test
##
## data: refage by cc.x
## t = -0.038983, df = 490.73, p-value = 0.9689
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.211407 1.164273
## sample estimates:
## mean in group 0 mean in group 1
## 47.79757 47.82114
ggplot(pheno.df) +
aes(offset.x, fill=as.factor(cc.x)) +
geom_histogram(colour="black", alpha=0.3, binwidth=1)
ggplot(pheno.df) +
geom_density(aes(offset.x), fill="red", colour="black", alpha=0.3) +
geom_density(aes(rstime), fill="blue", colour="black", alpha=0.3)
ggplot(pheno.df) +
geom_density(aes(rstime), fill="red", colour="black", alpha=0.3) +
geom_density(aes(refage - sub_dx_age.x), fill="blue", colour="black", alpha=0.3)
ggplot(pheno.df) +
aes(refage - sub_dx_age.x, fill=as.factor(cc.x)) +
geom_histogram(colour="black", alpha=0.3, binwidth=1)
ggplot(pheno.df) +
aes(refage - sub_dx_age.x, fill=as.factor(cc.x)) +
geom_density(colour="black", alpha=0.3)
ggplot(pheno.df) +
geom_density(aes(refage), fill="red", colour="black", alpha=0.3) +
geom_density(aes(sub_dx_age.x), fill="blue", colour="black", alpha=0.3)
# dsmiss
dsmiss[1:5]
## [1] 0 0 0 0 1
class(dsmiss)
## [1] "integer"
summary(dsmiss)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000000 0.000000 0.000000 0.006085 0.000000 1.000000
table(dsmiss)
## dsmiss
## 0 1
## 490 3
sum(is.na(dsmiss))
## [1] 0
ggplot(pheno.df) +
aes(dsmiss, fill=as.factor(cc.x)) +
geom_bar(colour="black", alpha=0.3)
# Balanced between cases and controls
fisher.test(table(cc.x,dsmiss))
##
## Fisher's Exact Test for Count Data
##
## data: table(cc.x, dsmiss)
## p-value = 1
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.008446842 9.676203318
## sample estimates:
## odds ratio
## 0.5006798
ggplot(pheno.df) +
aes(as.factor(cc.x), fill=as.factor(dsmiss)) +
geom_bar(colour="black", alpha=0.3)
# conf_miss
conf_miss[1:5]
## [1] 0 0 0 0 0
class(conf_miss)
## [1] "numeric"
summary(conf_miss)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000000 0.000000 0.000000 0.006085 0.000000 1.000000
table(conf_miss)
## conf_miss
## 0 1
## 490 3
sum(is.na(conf_miss))
## [1] 0
ggplot(pheno.df) +
aes(conf_miss, fill=as.factor(cc.x)) +
geom_bar(colour="black", alpha=0.3)
# Balanced between cases and controls
fisher.test(table(cc.x,conf_miss))
##
## Fisher's Exact Test for Count Data
##
## data: table(cc.x, conf_miss)
## p-value = 0.2485
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.000000 2.424045
## sample estimates:
## odds ratio
## 0
ggplot(pheno.df) +
aes(as.factor(cc.x), fill=as.factor(conf_miss)) +
geom_bar(colour="black", alpha=0.3)
# dsmiss vs conf_miss
sum(dsmiss != conf_miss, na.rm=TRUE)
## [1] 6
# Phase - all 1
Phase[1:5]
## [1] 1 1 1 1 1
class(Phase)
## [1] "numeric"
summary(Phase)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 1 1 1 1 1
table(Phase)
## Phase
## 1
## 493
sum(is.na(Phase))
## [1] 0
sum(Phase!=1)
## [1] 0
# Deleterious
Deleterious[1:5]
## [1] 0 0 0 0 0
class(Deleterious)
## [1] "integer"
summary(Deleterious)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.03651 0.00000 9.00000
table(Deleterious)
## Deleterious
## 0 9
## 491 2
sum(is.na(Deleterious))
## [1] 0
ggplot(pheno.df) +
aes(as.factor(Deleterious), fill=as.factor(cc.x)) +
geom_bar(colour="black", alpha=0.3)
# Balanced between cases and controls
fisher.test(table(cc.x,Deleterious))
##
## Fisher's Exact Test for Count Data
##
## data: table(cc.x, Deleterious)
## p-value = 0.2485
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.1886951 Inf
## sample estimates:
## odds ratio
## Inf
ggplot(pheno.df) +
aes(as.factor(cc.x), fill=as.factor(Deleterious)) +
geom_bar(colour="black", alpha=0.3)
# Detach pheno data frame
detach(pheno.df)
# Select columns
pheno.df <- pheno.df %>% select(
labid = labid.x,
cc = cc.x,
family_history = family_history.y,
sub_dx_age = sub_dx_age.x,
refage = refage,
rstime = rstime,
offset = offset.x,
Eigen_1 = Eigen_1.x,
Eigen_2 = Eigen_2.x,
Eigen_3 = Eigen_3.x,
Eigen_4 = Eigen_4.x,
Eigen_5 = Eigen_5.x,
stage_fd = stage_fd,
er1 = er1,
pr1_cat = pr1_cat,
histo1_cat = histo1_cat,
hormone = hormone,
CMF = CMF,
XRTBrCHAR = XRTBrCHAR,
dose_caseloc = dose_caseloc,
rh_age_menarche = rh_age_menarche,
age_1fftp_fd = age_1fftp_fd,
age_menopause_1yrbf_fd = age_menopause_1yrbf_fd,
num_preg = num_preg,
BMI_age18 = BMI_age18,
BMI_dx = BMI_dx,
BMI_ref = BMI_ref)
# Recode categorical pr to NA, 0 and 1
pheno.df <- pheno.df %>% mutate(
pr1 = ifelse(pr1_cat=="negative", 0,
ifelse(pr1_cat=="positive", 1,
NA)))
# Substitute zero age of menarche to NA
pheno.df <- pheno.df %>% mutate(
age_menarche = ifelse(rh_age_menarche==0, NA, rh_age_menarche))
# Remove corrected and recoded fields
pheno.df <- pheno.df %>% select(-rh_age_menarche, -pr1_cat)
# Remove demographics table
rm(demographics.df, odd_cases, duplicated_cases)
# Check data frames
dim(covar.df)
## [1] 498 13
dim(pheno.df)
## [1] 493 27
colnames(covar.df)
## [1] "labid" "cc" "sub_dx_age" "offset" "chemo"
## [6] "hormone" "xrt" "dose" "Eigen_1" "Eigen_2"
## [11] "Eigen_3" "Eigen_4" "Eigen_5"
colnames(pheno.df)
## [1] "labid" "cc"
## [3] "family_history" "sub_dx_age"
## [5] "refage" "rstime"
## [7] "offset" "Eigen_1"
## [9] "Eigen_2" "Eigen_3"
## [11] "Eigen_4" "Eigen_5"
## [13] "stage_fd" "er1"
## [15] "histo1_cat" "hormone"
## [17] "CMF" "XRTBrCHAR"
## [19] "dose_caseloc" "age_1fftp_fd"
## [21] "age_menopause_1yrbf_fd" "num_preg"
## [23] "BMI_age18" "BMI_dx"
## [25] "BMI_ref" "pr1"
## [27] "age_menarche"
# Prepare key for joining
pheno.df$labid <- as.vector(pheno.df$labid)
covar.df$labid <- as.vector(covar.df$labid)
ph.df <- left_join(covar.df, pheno.df, by="labid")
dim(ph.df)
## [1] 498 39
colnames(ph.df)
## [1] "labid" "cc.x"
## [3] "sub_dx_age.x" "offset.x"
## [5] "chemo" "hormone.x"
## [7] "xrt" "dose"
## [9] "Eigen_1.x" "Eigen_2.x"
## [11] "Eigen_3.x" "Eigen_4.x"
## [13] "Eigen_5.x" "cc.y"
## [15] "family_history" "sub_dx_age.y"
## [17] "refage" "rstime"
## [19] "offset.y" "Eigen_1.y"
## [21] "Eigen_2.y" "Eigen_3.y"
## [23] "Eigen_4.y" "Eigen_5.y"
## [25] "stage_fd" "er1"
## [27] "histo1_cat" "hormone.y"
## [29] "CMF" "XRTBrCHAR"
## [31] "dose_caseloc" "age_1fftp_fd"
## [33] "age_menopause_1yrbf_fd" "num_preg"
## [35] "BMI_age18" "BMI_dx"
## [37] "BMI_ref" "pr1"
## [39] "age_menarche"
attach(ph.df)
sum(cc.x != cc.y, na.rm = TRUE)
## [1] 0
sum(is.na(cc.x)) # to be preserved
## [1] 0
sum(is.na(cc.y)) # to be removed
## [1] 5
sum(sub_dx_age.x != sub_dx_age.y, na.rm = TRUE)
## [1] 0
sum(is.na(sub_dx_age.x)) # to be preserved
## [1] 0
sum(is.na(sub_dx_age.y)) # to be removed
## [1] 5
sum(offset.x != offset.y, na.rm = TRUE)
## [1] 0
sum(is.na(offset.x)) # to be preserved
## [1] 0
sum(is.na(offset.y)) # to be removed
## [1] 5
sum(hormone.x != hormone.y, na.rm = TRUE)
## [1] 0
sum(is.na(hormone.x)) # to be preserved
## [1] 1
sum(is.na(hormone.y)) # to be removed
## [1] 6
sum(is.na(chemo)) # to be removed, substituted by CMF
## [1] 0
sum(Eigen_1.x != Eigen_1.y, na.rm = TRUE)
## [1] 0
sum(is.na(Eigen_1.x)) # to be preserved
## [1] 0
sum(is.na(Eigen_1.y)) # to be removed
## [1] 5
sum(Eigen_2.x != Eigen_2.y, na.rm = TRUE)
## [1] 0
sum(is.na(Eigen_2.x)) # to be preserved
## [1] 0
sum(is.na(Eigen_2.y)) # to be removed
## [1] 5
sum(Eigen_3.x != Eigen_3.y, na.rm = TRUE)
## [1] 0
sum(is.na(Eigen_3.x)) # to be preserved
## [1] 0
sum(is.na(Eigen_3.y)) # to be removed
## [1] 5
sum(Eigen_4.x != Eigen_4.y, na.rm = TRUE)
## [1] 0
sum(is.na(Eigen_4.x)) # to be preserved
## [1] 0
sum(is.na(Eigen_4.y)) # to be removed
## [1] 5
sum(Eigen_5.x != Eigen_5.y, na.rm = TRUE)
## [1] 0
sum(is.na(Eigen_5.x)) # to be preserved
## [1] 0
sum(is.na(Eigen_5.y)) # to be removed
## [1] 5
sum(is.na(family_history)) # to be preserved
## [1] 5
sum(is.na(refage)) # to be preserved
## [1] 5
sum(is.na(rstime)) # to be preserved
## [1] 5
sum(is.na(stage_fd)) # to be preserved
## [1] 5
sum(is.na(er1)) # to be preserved
## [1] 155
sum(is.na(histo1_cat)) # to be preserved
## [1] 5
sum(is.na(CMF)) # to be preserved
## [1] 5
sum(is.na(XRTBrCHAR)) # to be preserved
## [1] 5
sum(is.na(dose_caseloc)) # to be preserved
## [1] 8
sum(is.na(age_1fftp_fd)) # to be preserved
## [1] 5
sum(is.na(age_menopause_1yrbf_fd)) # to be preserved
## [1] 8
sum(is.na(num_preg)) # to be preserved
## [1] 5
sum(is.na(BMI_age18)) # to be preserved
## [1] 7
sum(is.na(BMI_dx)) # to be preserved
## [1] 8
sum(is.na(BMI_ref)) # to be preserved
## [1] 8
sum(is.na(pr1)) # to be preserved
## [1] 215
sum(is.na(age_menarche)) # to be preserved
## [1] 9
detach(ph.df)
ph.df <- ph.df %>%
select(
cc = cc.x,
age_dx = sub_dx_age.x,
age_ref = refage,
rstime = rstime,
offset = offset.x,
Eigen_1 = Eigen_1.x,
Eigen_2 = Eigen_2.x,
Eigen_3 = Eigen_3.x,
Eigen_4 = Eigen_4.x,
Eigen_5 = Eigen_5.x,
stage = stage_fd,
er1 = er1,
pr1 = pr1,
hist_cat = histo1_cat,
hormone = hormone.x,
chemo_cat = CMF,
br_xray = XRTBrCHAR,
br_xray_dose = dose_caseloc,
age_menarche = age_menarche,
age_1st_ftp = age_1fftp_fd,
age_menopause = age_menopause_1yrbf_fd,
num_preg = num_preg,
BMI_age18 = BMI_age18,
BMI_dx = BMI_dx,
BMI_ref = BMI_ref)
rm(pheno.df, covar.df)
save.image(file="data/s05_reshape_data_feb2016_tmp.RData")
sessionInfo()
## R version 3.2.3 (2015-12-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Scientific Linux release 6.7 (Carbon)
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggplot2_2.1.0 stringr_1.0.0 dplyr_0.4.3 tidyr_0.4.1 knitr_1.12.3
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.4 magrittr_1.5 munsell_0.4.3 colorspace_1.2-6
## [5] R6_2.1.2 plyr_1.8.3 tools_3.2.3 parallel_3.2.3
## [9] grid_3.2.3 gtable_0.2.0 DBI_0.3.1 htmltools_0.3.5
## [13] yaml_2.1.13 lazyeval_0.1.10 assertthat_0.1 digest_0.6.9
## [17] reshape2_1.4.1 formatR_1.3 evaluate_0.8.3 rmarkdown_0.9.5
## [21] labeling_0.3 stringi_1.0-1 scales_0.4.0
Sys.time()
## [1] "2016-06-02 21:35:32 BST"